From ace56d41aca8cac7cead9c2c97278aa50fc945b1 Mon Sep 17 00:00:00 2001 From: Maxim Kuvyrkov Date: Thu, 18 Mar 2021 07:42:41 +0000 Subject: [PATCH 0001/1000] [WoA][MSVC] Use default linker setting in MSVC-compatible driver At the moment "link.exe" is hard-coded as default linker in MSVC.cpp, so there's no way to use LLD as default linker for MSVC driver. This patch adds checking of CLANG_DEFAULT_LINKER to MSVC.cpp. Reviewed By: asl Differential Revision: https://reviews.llvm.org/D98493 --- clang/lib/Driver/ToolChains/MSVC.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 96de02378ca2..38ad7125b4af 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -11,6 +11,7 @@ #include "Darwin.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/Version.h" +#include "clang/Config/config.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" @@ -577,7 +578,10 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA, // translate 'lld' into 'lld-link', and in the case of the regular msvc // linker, we need to use a special search algorithm. llvm::SmallString<128> linkPath; - StringRef Linker = Args.getLastArgValue(options::OPT_fuse_ld_EQ, "link"); + StringRef Linker = Args.getLastArgValue(options::OPT_fuse_ld_EQ, + CLANG_DEFAULT_LINKER); + if (Linker.empty()) + Linker = "link"; if (Linker.equals_lower("lld")) Linker = "lld-link"; -- GitLab From 6802fdf8871f69d52b06d0a2b7f62f3af8292690 Mon Sep 17 00:00:00 2001 From: Maxim Kuvyrkov Date: Thu, 18 Mar 2021 07:47:16 +0000 Subject: [PATCH 0002/1000] [NFC] Minor cleanup to use default setting of getLastArg() Noticed this while I was looking at linker defaults. Reviewed By: asl Differential Revision: https://reviews.llvm.org/D98494 --- clang/lib/Driver/ToolChain.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 217ba56c3351..94ae40e1e65f 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -571,8 +571,8 @@ std::string ToolChain::GetLinkerPath(bool *LinkerIsLLD, // Get -fuse-ld= first to prevent -Wunused-command-line-argument. -fuse-ld= is // considered as the linker flavor, e.g. "bfd", "gold", or "lld". - const Arg* A = Args.getLastArg(options::OPT_fuse_ld_EQ); - StringRef UseLinker = A ? A->getValue() : CLANG_DEFAULT_LINKER; + StringRef UseLinker = Args.getLastArgValue(options::OPT_fuse_ld_EQ, + CLANG_DEFAULT_LINKER); // --ld-path= takes precedence over -fuse-ld= and specifies the executable // name. -B, COMPILER_PATH and PATH and consulted if the value does not -- GitLab From 1ce70c15ed3b9c84d6d73abd74f6605bccdf2e7b Mon Sep 17 00:00:00 2001 From: Frederik Gossen Date: Thu, 18 Mar 2021 08:58:59 +0100 Subject: [PATCH 0003/1000] [MLIR] Canonicalize broadcast operations on single shapes This covers cases that are not folded away because the extent tensor type becomes more concrete in the process. 
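For example (a sketch mirroring the test added below; the dynamic result type
tensor<?xindex> is an assumption, since this listing elides it), the new
pattern rewrites

  %0 = shape.broadcast %a : tensor<3xindex> -> tensor<?xindex>
  "use"(%0) : (tensor<?xindex>) -> ()

into a direct use of the single operand:

  "use"(%a) : (tensor<3xindex>) -> ()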
Differential Revision: https://reviews.llvm.org/D98782 --- mlir/lib/Dialect/Shape/IR/Shape.cpp | 17 ++++++++++++++++- mlir/test/Dialect/Shape/canonicalize.mlir | 12 ++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp index ed8dcfc13549..33719951f3e9 100644 --- a/mlir/lib/Dialect/Shape/IR/Shape.cpp +++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp @@ -414,11 +414,26 @@ struct RemoveDuplicateOperandsPattern : public OpRewritePattern { return failure(); } }; + +struct BroadcastForwardSingleOperandPattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(BroadcastOp op, + PatternRewriter &rewriter) const override { + if (op.getNumOperands() == 1) { + rewriter.replaceOp(op, op.shapes().front()); + return success(); + } + return failure(); + } +}; } // namespace void BroadcastOp::getCanonicalizationPatterns( OwningRewritePatternList &patterns, MLIRContext *context) { - patterns.insert>(context); + patterns.insert>(context); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/Shape/canonicalize.mlir b/mlir/test/Dialect/Shape/canonicalize.mlir index 53f27e4839cf..3399fe0f4e23 100644 --- a/mlir/test/Dialect/Shape/canonicalize.mlir +++ b/mlir/test/Dialect/Shape/canonicalize.mlir @@ -1119,3 +1119,15 @@ func @broadcast_on_duplicate_shapes(%a : !shape.shape, %b : !shape.shape) !shape.shape, !shape.shape, !shape.shape, !shape.shape -> !shape.shape return %0 : !shape.shape } + +// ----- + +// CHECK-LABEL: @broadcast_on_single_operand +// CHECK-SAME: (%[[A:.*]]: tensor<3xindex>) +func @broadcast_on_single_operand(%a : tensor<3xindex>) { + // CHECK-NOT: broadcast + // CHECK: "use"(%[[A]]) + %0 = shape.broadcast %a : tensor<3xindex> -> tensor + "use"(%0) : (tensor) -> () + return +} -- GitLab From 62948c4532d59b59f63409eae5d7f9e4990e5626 Mon Sep 17 00:00:00 2001 From: Maxim Kuvyrkov Date: Thu, 18 Mar 2021 08:05:14 +0000 Subject: [PATCH 0004/1000] Revert "[NFC] Minor cleanup to use default setting of getLastArg()" The patch was wrong. We use "const Arg *A" at the end of GetLinkerPath, so can't remove it. This reverts commit 6802fdf8871f69d52b06d0a2b7f62f3af8292690. --- clang/lib/Driver/ToolChain.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 94ae40e1e65f..217ba56c3351 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -571,8 +571,8 @@ std::string ToolChain::GetLinkerPath(bool *LinkerIsLLD, // Get -fuse-ld= first to prevent -Wunused-command-line-argument. -fuse-ld= is // considered as the linker flavor, e.g. "bfd", "gold", or "lld". - StringRef UseLinker = Args.getLastArgValue(options::OPT_fuse_ld_EQ, - CLANG_DEFAULT_LINKER); + const Arg* A = Args.getLastArg(options::OPT_fuse_ld_EQ); + StringRef UseLinker = A ? A->getValue() : CLANG_DEFAULT_LINKER; // --ld-path= takes precedence over -fuse-ld= and specifies the executable // name. 
-B, COMPILER_PATH and PATH and consulted if the value does not
-- GitLab

From 3d0aed79362de001bf010ae027f099a177ed19ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20B=C3=B6ck?=
Date: Thu, 18 Mar 2021 09:24:49 +0100
Subject: [PATCH 0005/1000] [CMake] Use compiler-rt location instead of
 resource directory to find clang-cl's runtime directory

The current CMake script attempts to add the path containing clang's various
runtimes by getting the resource directory and then appending the hardcoded
value /lib/windows to it. This works for a normal clang-cl build but fails for
a build of clang using LLVM_ENABLE_PER_TARGET_RUNTIME_DIR, such as the builds
from llvm/runtimes.

This patch instead uses -print-libgcc-file-name in conjunction with
--rtlib=compiler-rt, and adds the containing directory as a library path.

For non-per-target runtime directory builds, such as the release builds, there
is no change: even if the builtins library were to be deleted or moved, the
compiler would output the same path as before. For per-target runtime builds
that also have the builtins library, this now finds the correct directory
containing all of clang's runtime libraries.

The only case still not handled by this change is a per-target runtime
directory build in which the builtins library was not built. I believe that is
the best we can do for now without modifying clang.

Differential Revision: https://reviews.llvm.org/D98786
---
 llvm/cmake/modules/HandleLLVMOptions.cmake | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 2e088bd6e916..d85fe137c191 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -982,8 +982,8 @@ endif()
 # linker directly, it isn't sufficient to pass -fsanitize=* to the linker.
 if (CLANG_CL AND (LLVM_BUILD_INSTRUMENTED OR LLVM_USE_SANITIZER))
   execute_process(
-    COMMAND ${CMAKE_CXX_COMPILER} /clang:-print-resource-dir
-    OUTPUT_VARIABLE clang_resource_dir
+    COMMAND ${CMAKE_CXX_COMPILER} /clang:-print-libgcc-file-name /clang:--rtlib=compiler-rt
+    OUTPUT_VARIABLE clang_compiler_rt_file
     ERROR_VARIABLE clang_cl_stderr
     OUTPUT_STRIP_TRAILING_WHITESPACE
     ERROR_STRIP_TRAILING_WHITESPACE
@@ -992,8 +992,9 @@ if (CLANG_CL AND (LLVM_BUILD_INSTRUMENTED OR LLVM_USE_SANITIZER))
     message(FATAL_ERROR "Unable to invoke clang-cl to find resource dir: ${clang_cl_stderr}")
   endif()
-  file(TO_CMAKE_PATH "${clang_resource_dir}" clang_resource_dir)
-  append("/libpath:${clang_resource_dir}/lib/windows"
+  file(TO_CMAKE_PATH "${clang_compiler_rt_file}" clang_compiler_rt_file)
+  get_filename_component(clang_runtime_dir "${clang_compiler_rt_file}" DIRECTORY)
+  append("/libpath:${clang_runtime_dir}"
          CMAKE_EXE_LINKER_FLAGS
          CMAKE_MODULE_LINKER_FLAGS
          CMAKE_SHARED_LINKER_FLAGS)
-- GitLab

From 90ecb862a003d581136842dcdc213315727d50e2 Mon Sep 17 00:00:00 2001
From: Sjoerd Meijer
Date: Tue, 16 Mar 2021 11:53:43 +0000
Subject: [PATCH 0006/1000] [AArch64] Rewrite (add, csel) to cinc

Don't rewrite an add instruction with 2 SET_CC operands into a csel
instruction: the resulting sequence uses an extra instruction and an extra
register. Preventing this allows us to match an `(add, csel)` pattern and
rewrite it into a `cinc`.
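For example, in the half.ll update below, the tail of the generated sequence
changes from

  cset w8, pl
  csinc w9, w9, wzr, mi
  add w0, w8, w9

to

  csinc w8, w10, wzr, mi
  cinc w0, w8, pl

i.e. the trailing cset/add pair folds into a single cinc, saving an
instruction and a register (excerpt taken from the test diff in this patch).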
Differential Revision: https://reviews.llvm.org/D98704 --- .../Target/AArch64/AArch64ISelLowering.cpp | 7 ++++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 +++ llvm/test/CodeGen/AArch64/arm64-csel.ll | 41 +++++++++++++++++++ llvm/test/CodeGen/AArch64/half.ll | 12 +++--- 4 files changed, 59 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3c823f5ac522..e3c928e1b79b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -13190,6 +13190,13 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { SDValue RHS = Op->getOperand(1); SetCCInfoAndKind InfoAndKind; + // If both operands are a SET_CC, then we don't want to perform this + // folding and create another csel as this results in more instructions + // (and higher register usage). + if (isSetCCOrZExtSetCC(LHS, InfoAndKind) && + isSetCCOrZExtSetCC(RHS, InfoAndKind)) + return SDValue(); + // If neither operand is a SET_CC, give up. if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { std::swap(LHS, RHS); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index d5dd0ae99463..338963fec616 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -2162,6 +2162,11 @@ def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV), def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV), (CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>; +def : Pat<(add GPR32:$val, (AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV)), + (CSINCWr GPR32:$val, GPR32:$val, (i32 imm:$cc))>; +def : Pat<(add GPR64:$val, (zext (AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV))), + (CSINCXr GPR64:$val, GPR64:$val, (i32 imm:$cc))>; + // The inverse of the condition code from the alias instruction is what is used // in the aliased instruction. The parser all ready inverts the condition code // for these aliases. 
diff --git a/llvm/test/CodeGen/AArch64/arm64-csel.ll b/llvm/test/CodeGen/AArch64/arm64-csel.ll
index f031710a4dcb..44e951ed69e1 100644
--- a/llvm/test/CodeGen/AArch64/arm64-csel.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-csel.ll
@@ -269,3 +269,44 @@ define i64 @foo23(i64 %x) {
   %res = select i1 %cmp, i64 1, i64 6
   ret i64 %res
 }
+
+define i16 @foo24(i8* nocapture readonly %A, i8* nocapture readonly %B) {
+; CHECK-LABEL: foo24:
+; CHECK: ldrb w[[W8:[0-9]+]], [x1]
+; CHECK-NEXT: ldrb w[[W9:[0-9]+]], [x0]
+; CHECK-NEXT: cmp w[[W8]], #33
+; CHECK-NEXT: cset w[[W8]], hi
+; CHECK-NEXT: cmp w[[W9]], #3
+; CHECK-NEXT: cinc w0, w[[W8]], hi
+; CHECK-NEXT: ret
+entry:
+  %0 = load i8, i8* %A, align 1
+  %cmp = icmp ugt i8 %0, 3
+  %conv1 = zext i1 %cmp to i16
+  %1 = load i8, i8* %B, align 1
+  %cmp4 = icmp ugt i8 %1, 33
+  %conv5 = zext i1 %cmp4 to i16
+  %add = add nuw nsw i16 %conv5, %conv1
+  ret i16 %add
+}
+
+define i64 @foo25(i64* nocapture readonly %A, i64* nocapture readonly %B) {
+; CHECK-LABEL: foo25:
+; CHECK: ldr x[[X8:[0-9]+]], [x1]
+; CHECK-NEXT: ldr x[[X9:[0-9]+]], [x0]
+; CHECK-NEXT: cmp x[[X8]], #33
+; CHECK-NEXT: cset w[[W8]], hi
+; CHECK-NEXT: cmp x[[X9]], #3
+; CHECK-NEXT: cinc x0, x[[X8]], hi
+; CHECK-NEXT: ret
+entry:
+  %0 = load i64, i64* %A, align 1
+  %cmp = icmp ugt i64 %0, 3
+  %conv1 = zext i1 %cmp to i64
+  %1 = load i64, i64* %B, align 1
+  %cmp4 = icmp ugt i64 %1, 33
+  %conv5 = zext i1 %cmp4 to i64
+  %add = add nuw nsw i64 %conv5, %conv1
+  ret i64 %add
+}
+
diff --git a/llvm/test/CodeGen/AArch64/half.ll b/llvm/test/CodeGen/AArch64/half.ll
index b815c53d02bc..ab64cc04374f 100644
--- a/llvm/test/CodeGen/AArch64/half.ll
+++ b/llvm/test/CodeGen/AArch64/half.ll
@@ -107,12 +107,12 @@ define i16 @test_fccmp(i1 %a, i16 %in) {
 ; CHECK-NEXT: movk w9, #15428, lsl #16
 ; CHECK-NEXT: fmov s1, w8
 ; CHECK-NEXT: fcmp s0, s1
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: cset w8, pl
-; CHECK-NEXT: fccmp s0, s1, #8, pl
-; CHECK-NEXT: mov w9, #4
-; CHECK-NEXT: csinc w9, w9, wzr, mi
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: fmov s2, w9
+; CHECK-NEXT: mov w10, #4
+; CHECK-NEXT: fccmp s0, s2, #8, pl
+; CHECK-NEXT: csinc w8, w10, wzr, mi
+; CHECK-NEXT: fcmp s0, s1
+; CHECK-NEXT: cinc w0, w8, pl
+; CHECK-NEXT: ret
   %f16 = bitcast i16 %in to half
   %cmp0 = fcmp ogt half 0xH3333, %f16
-- GitLab

From e64adc0b88c2705425a9fe2345729e2688a4e4c6 Mon Sep 17 00:00:00 2001
From: "Luo, Yuanke"
Date: Wed, 17 Mar 2021 19:17:18 +0800
Subject: [PATCH 0007/1000] [X86] Fix compile time regression of D93594.

D93594 made this pass depend on the dominator tree and loop information,
which increased compile time when building with -O0. However, the pass only
needs them to amend the dominator tree and loop information, so it is
unnecessary to re-analyze them. Given that the dominator tree and loop
information are absent in this pass, we can avoid amending them.
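The gist of the new setup (a sketch drawn from the diff below; the template
arguments, which this listing elides, are restored here as an assumption):

  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
  auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
  auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
  // A DomTreeUpdater constructed with a null DominatorTree* turns all
  // updates into no-ops, and LoopInfo is only amended when LI is non-null.
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);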
Differential Revision: https://reviews.llvm.org/D98773 --- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 166 ++++++++++-------- llvm/test/CodeGen/X86/O0-pipeline.ll | 2 - llvm/test/CodeGen/X86/opt-pipeline.ll | 4 +- 3 files changed, 90 insertions(+), 82 deletions(-) diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 9b6e54682f8c..134df5d9569c 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -50,10 +50,38 @@ static bool isV256I32Ty(Type *Ty) { return false; } -static BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, - Value *Bound, Value *Step, StringRef Name, - IRBuilderBase &B, DomTreeUpdater &DTU, Loop *L, - LoopInfo &LI) { +namespace { +class X86LowerAMXIntrinsics { + Function &Func; + +public: + X86LowerAMXIntrinsics(Function &F, DomTreeUpdater &DomTU, LoopInfo *LoopI) + : Func(F), DTU(DomTU), LI(LoopI) {} + bool visit(); + +private: + DomTreeUpdater &DTU; + LoopInfo *LI; + BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, Value *Bound, + Value *Step, StringRef Name, IRBuilderBase &B, + Loop *L); + template + Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End, + IRBuilderBase &B, Value *Row, Value *Col, + Value *Ptr, Value *Stride, Value *Tile); + Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End, + IRBuilderBase &B, Value *Row, Value *Col, + Value *K, Value *Acc, Value *LHS, Value *RHS); + template + bool lowerTileLoadStore(Instruction *TileLoadStore); + bool lowerTileDPBSSD(Instruction *TileDPBSSD); + bool lowerTileZero(Instruction *TileZero); +}; + +BasicBlock *X86LowerAMXIntrinsics::createLoop(BasicBlock *Preheader, + BasicBlock *Exit, Value *Bound, + Value *Step, StringRef Name, + IRBuilderBase &B, Loop *L) { LLVMContext &Ctx = Preheader->getContext(); BasicBlock *Header = BasicBlock::Create(Ctx, Name + ".header", Preheader->getParent(), Exit); @@ -86,35 +114,37 @@ static BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, {DominatorTree::Insert, Latch, Exit}, {DominatorTree::Insert, Preheader, Header}, }); - - L->addBasicBlockToLoop(Header, LI); - L->addBasicBlockToLoop(Body, LI); - L->addBasicBlockToLoop(Latch, LI); + if (LI) { + L->addBasicBlockToLoop(Header, *LI); + L->addBasicBlockToLoop(Body, *LI); + L->addBasicBlockToLoop(Latch, *LI); + } return Body; } template -static Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End, - IRBuilderBase &B, DomTreeUpdater &DTU, - LoopInfo &LI, Value *Row, Value *Col, - Value *Ptr, Value *Stride, Value *Tile) { +Value *X86LowerAMXIntrinsics::createTileLoadStoreLoops( + BasicBlock *Start, BasicBlock *End, IRBuilderBase &B, Value *Row, + Value *Col, Value *Ptr, Value *Stride, Value *Tile) { std::string IntrinName = IsTileLoad ? 
"tileload" : "tilestore"; - Loop *RowLoop = LI.AllocateLoop(); - Loop *ColLoop = LI.AllocateLoop(); - RowLoop->addChildLoop(ColLoop); - if (Loop *ParentL = LI.getLoopFor(Start)) - ParentL->addChildLoop(RowLoop); - else - LI.addTopLevelLoop(RowLoop); + Loop *RowLoop = nullptr; + Loop *ColLoop = nullptr; + if (LI) { + RowLoop = LI->AllocateLoop(); + ColLoop = LI->AllocateLoop(); + RowLoop->addChildLoop(ColLoop); + if (Loop *ParentL = LI->getLoopFor(Start)) + ParentL->addChildLoop(RowLoop); + else + LI->addTopLevelLoop(RowLoop); + } - BasicBlock *RowBody = - createLoop(Start, End, Row, B.getInt16(1), IntrinName + ".scalarize.rows", - B, DTU, RowLoop, LI); + BasicBlock *RowBody = createLoop(Start, End, Row, B.getInt16(1), + IntrinName + ".scalarize.rows", B, RowLoop); BasicBlock *RowLatch = RowBody->getSingleSuccessor(); - BasicBlock *ColBody = - createLoop(RowBody, RowLatch, Col, B.getInt16(1), - IntrinName + ".scalarize.cols", B, DTU, ColLoop, LI); + BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col, B.getInt16(1), + IntrinName + ".scalarize.cols", B, ColLoop); BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor(); BasicBlock *ColLoopHeader = ColBody->getSinglePredecessor(); @@ -181,35 +211,36 @@ static Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End, } } -static Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End, - IRBuilderBase &B, DomTreeUpdater &DTU, - LoopInfo &LI, Value *Row, Value *Col, - Value *K, Value *Acc, Value *LHS, - Value *RHS) { - Loop *RowLoop = LI.AllocateLoop(); - Loop *ColLoop = LI.AllocateLoop(); - Loop *InnerLoop = LI.AllocateLoop(); - ColLoop->addChildLoop(InnerLoop); - RowLoop->addChildLoop(ColLoop); - if (Loop *ParentL = LI.getLoopFor(Start)) - ParentL->addChildLoop(RowLoop); - else - LI.addTopLevelLoop(RowLoop); +Value *X86LowerAMXIntrinsics::createTileDPBSSDLoops( + BasicBlock *Start, BasicBlock *End, IRBuilderBase &B, Value *Row, + Value *Col, Value *K, Value *Acc, Value *LHS, Value *RHS) { + Loop *RowLoop = nullptr; + Loop *ColLoop = nullptr; + Loop *InnerLoop = nullptr; + if (LI) { + RowLoop = LI->AllocateLoop(); + ColLoop = LI->AllocateLoop(); + InnerLoop = LI->AllocateLoop(); + ColLoop->addChildLoop(InnerLoop); + RowLoop->addChildLoop(ColLoop); + if (Loop *ParentL = LI->getLoopFor(Start)) + ParentL->addChildLoop(RowLoop); + else + LI->addTopLevelLoop(RowLoop); + } - BasicBlock *RowBody = - createLoop(Start, End, Row, B.getInt16(1), "tiledpbssd.scalarize.rows", B, - DTU, RowLoop, LI); + BasicBlock *RowBody = createLoop(Start, End, Row, B.getInt16(1), + "tiledpbssd.scalarize.rows", B, RowLoop); BasicBlock *RowLatch = RowBody->getSingleSuccessor(); - BasicBlock *ColBody = - createLoop(RowBody, RowLatch, Col, B.getInt16(1), - "tiledpbssd.scalarize.cols", B, DTU, ColLoop, LI); + BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col, B.getInt16(1), + "tiledpbssd.scalarize.cols", B, ColLoop); BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor(); B.SetInsertPoint(ColBody->getTerminator()); BasicBlock *InnerBody = createLoop(ColBody, ColLoopLatch, K, B.getInt16(1), - "tiledpbssd.scalarize.inner", B, DTU, InnerLoop, LI); + "tiledpbssd.scalarize.inner", B, InnerLoop); BasicBlock *ColLoopHeader = ColBody->getSinglePredecessor(); BasicBlock *RowLoopHeader = RowBody->getSinglePredecessor(); @@ -324,30 +355,11 @@ static Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End, return NewVecD; } -namespace { -class X86LowerAMXIntrinsics { - Function &Func; - -public: - X86LowerAMXIntrinsics(Function &F, DominatorTree *DT, 
LoopInfo *LI) - : Func(F), DT(DT), LI(LI) {} - bool visit(); - -private: - DominatorTree *DT; - LoopInfo *LI; - template - bool lowerTileLoadStore(Instruction *TileLoadStore); - bool lowerTileDPBSSD(Instruction *TileDPBSSD); - bool lowerTileZero(Instruction *TileZero); -}; - bool X86LowerAMXIntrinsics::lowerTileDPBSSD(Instruction *TileDPBSSD) { Value *M, *N, *K, *C, *A, *B; match(TileDPBSSD, m_Intrinsic( m_Value(M), m_Value(N), m_Value(K), m_Value(C), m_Value(A), m_Value(B))); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Instruction *InsertI = TileDPBSSD; IRBuilder<> PreBuilder(TileDPBSSD); PreBuilder.SetInsertPoint(TileDPBSSD); @@ -358,10 +370,10 @@ bool X86LowerAMXIntrinsics::lowerTileDPBSSD(Instruction *TileDPBSSD) { Value *KDWord = PreBuilder.CreateLShr(K, PreBuilder.getInt16(2)); BasicBlock *Start = InsertI->getParent(); BasicBlock *End = - SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue"); + SplitBlock(InsertI->getParent(), InsertI, &DTU, LI, nullptr, "continue"); IRBuilder<> Builder(TileDPBSSD); - Value *ResVec = createTileDPBSSDLoops(Start, End, Builder, DTU, *LI, M, - NDWord, KDWord, C, A, B); + Value *ResVec = + createTileDPBSSDLoops(Start, End, Builder, M, NDWord, KDWord, C, A, B); // we cannot assume there always be bitcast after tiledpbssd. So we need to // insert one bitcast as required Builder.SetInsertPoint(End->getFirstNonPHI()); @@ -394,7 +406,6 @@ bool X86LowerAMXIntrinsics::lowerTileLoadStore(Instruction *TileLoadStore) { m_Value(M), m_Value(N), m_Value(Ptr), m_Value(Stride), m_Value(Tile))); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Instruction *InsertI = TileLoadStore; IRBuilder<> PreBuilder(TileLoadStore); PreBuilder.SetInsertPoint(TileLoadStore); @@ -402,10 +413,10 @@ bool X86LowerAMXIntrinsics::lowerTileLoadStore(Instruction *TileLoadStore) { Value *StrideDWord = PreBuilder.CreateLShr(Stride, PreBuilder.getInt64(2)); BasicBlock *Start = InsertI->getParent(); BasicBlock *End = - SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue"); + SplitBlock(InsertI->getParent(), InsertI, &DTU, LI, nullptr, "continue"); IRBuilder<> Builder(TileLoadStore); Value *ResVec = createTileLoadStoreLoops( - Start, End, Builder, DTU, *LI, M, NDWord, Ptr, StrideDWord, + Start, End, Builder, M, NDWord, Ptr, StrideDWord, IsTileLoad ? nullptr : Tile); if (IsTileLoad) { // we cannot assume there always be bitcast after tileload. So we need to @@ -505,18 +516,19 @@ public: TM->getOptLevel() != CodeGenOpt::None) return false; - auto &DT = getAnalysis().getDomTree(); - auto &LI = getAnalysis().getLoopInfo(); + auto *DTWP = getAnalysisIfAvailable(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = getAnalysisIfAvailable(); + auto *LI = LIWP ? 
&LIWP->getLoopInfo() : nullptr; + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - X86LowerAMXIntrinsics LAT(F, &DT, &LI); + X86LowerAMXIntrinsics LAT(F, DTU, LI); return LAT.visit(); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); AU.addPreserved(); - AU.addRequired(); AU.addPreserved(); AU.addRequired(); } @@ -528,8 +540,6 @@ static const char PassName[] = "Lower AMX intrinsics"; char X86LowerAMXIntrinsicsLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll index 2e1cbacf4584..e5b3584a0339 100644 --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -18,8 +18,6 @@ ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand Atomic instructions -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store ; CHECK-NEXT: Module Verifier diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll index 0f92e5a7be06..9df12b7a3fd3 100644 --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -24,12 +24,12 @@ ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand Atomic instructions -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store ; CHECK-NEXT: Module Verifier +; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager -- GitLab From c2b4600ec8812decfd91fd66c3db862b0fbaa6ff Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 17 Mar 2021 12:33:59 +0000 Subject: [PATCH 0008/1000] [RISCV] Support bitcasts of fixed-length mask vectors Without this patch, bitcasts of fixed-length mask vectors would go through the stack. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D98779 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++-- .../CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 4856f2c5219e..b54e2ce73fd1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -558,6 +558,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::BITCAST, VT, Custom); + // Operations below are different for between masks and other vectors. 
      if (VT.getVectorElementType() == MVT::i1) {
        setOperationAction(ISD::AND, VT, Custom);
@@ -605,8 +607,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
       setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
 
-      setOperationAction(ISD::BITCAST, VT, Custom);
-
       // Custom-lower reduction operations to set up the corresponding custom
       // nodes' operands.
       setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
new file mode 100644
index 000000000000..53fe40a707b5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bitcast.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 < %s | FileCheck %s
+; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 < %s | FileCheck %s
+
+define <32 x i1> @bitcast_v4i8_v32i1(<4 x i8> %a, <32 x i1> %b) {
+; CHECK-LABEL: bitcast_v4i8_v32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, zero, 32
+; CHECK-NEXT: vsetvli a0, a0, e8,m2,ta,mu
+; CHECK-NEXT: vmxor.mm v0, v0, v8
+; CHECK-NEXT: ret
+  %c = bitcast <4 x i8> %a to <32 x i1>
+  %d = xor <32 x i1> %b, %c
+  ret <32 x i1> %d
+}
-- GitLab

From f1a7d5a7b0ec810057ff6e88371ab86d1fce812c Mon Sep 17 00:00:00 2001
From: Valeriy Savchenko
Date: Mon, 15 Mar 2021 22:00:07 +0300
Subject: [PATCH 0009/1000] [-Wcalled-once-parameter] Harden analysis in terms
 of block use

This patch introduces a very simple inter-procedural analysis between blocks
and enclosing functions.

We always analyze blocks first (the analysis is done as part of the semantic
analysis that goes side-by-side with the parsing process), so at the moment
of reporting we don't know how that block will actually be used.

This patch introduces new logic that delays reports of the "never called"
warnings on blocks. If we are not sure that the block will be called exactly
once, we shouldn't warn our users about that. Double calls, however, don't
require such delays. While analyzing the enclosing function, we can actually
decide what we should do with those warnings.

Additionally, as a side effect, we can be more confident about blocks in such
contexts and can treat them not as escapes, but as direct calls.

rdar://74090107

Differential Revision: https://reviews.llvm.org/D98688
---
 .../clang/Analysis/Analyses/CalledOnceCheck.h |  17 +-
 .../clang/Sema/AnalysisBasedWarnings.h        |   8 +-
 clang/lib/Analysis/CalledOnceCheck.cpp        | 151 +++++++++++++++--
 clang/lib/Sema/AnalysisBasedWarnings.cpp      | 156 ++++++++++++------
 clang/test/SemaObjC/warn-called-once.m        |  53 +++++-
 5 files changed, 314 insertions(+), 71 deletions(-)

diff --git a/clang/include/clang/Analysis/Analyses/CalledOnceCheck.h b/clang/include/clang/Analysis/Analyses/CalledOnceCheck.h
index fc574c680a44..a0c767bf92d2 100644
--- a/clang/include/clang/Analysis/Analyses/CalledOnceCheck.h
+++ b/clang/include/clang/Analysis/Analyses/CalledOnceCheck.h
@@ -17,6 +17,7 @@
 namespace clang {
 
 class AnalysisDeclContext;
+class BlockDecl;
 class CFG;
 class Decl;
 class DeclContext;
@@ -79,6 +80,7 @@ public:
   /// the path containing the call and not containing the call. This helps us
   /// to pinpoint a bad path for the user.
   /// \param Parameter -- parameter that should be called once.
+  /// \param Function -- function declaration where the problem occurred.
/// \param Where -- the least common ancestor statement. /// \param Reason -- a reason describing the path without a call. /// \param IsCalledDirectly -- true, if parameter actually gets called on @@ -86,9 +88,22 @@ public: /// collection, passed as a parameter, etc.). /// \param IsCompletionHandler -- true, if parameter is a completion handler. virtual void handleNeverCalled(const ParmVarDecl *Parameter, - const Stmt *Where, NeverCalledReason Reason, + const Decl *Function, const Stmt *Where, + NeverCalledReason Reason, bool IsCalledDirectly, bool IsCompletionHandler) {} + + /// Called when the block is guaranteed to be called exactly once. + /// It means that we can be stricter with what we report on that block. + /// \param Block -- block declaration that is known to be called exactly once. + virtual void + handleBlockThatIsGuaranteedToBeCalledOnce(const BlockDecl *Block) {} + + /// Called when the block has no guarantees about how many times it can get + /// called. + /// It means that we should be more lenient with reporting warnings in it. + /// \param Block -- block declaration in question. + virtual void handleBlockWithNoGuarantees(const BlockDecl *Block) {} }; /// Check given CFG for 'called once' parameter violations. diff --git a/clang/include/clang/Sema/AnalysisBasedWarnings.h b/clang/include/clang/Sema/AnalysisBasedWarnings.h index e13fe955eaf4..49b69c585ff7 100644 --- a/clang/include/clang/Sema/AnalysisBasedWarnings.h +++ b/clang/include/clang/Sema/AnalysisBasedWarnings.h @@ -14,6 +14,7 @@ #define LLVM_CLANG_SEMA_ANALYSISBASEDWARNINGS_H #include "llvm/ADT/DenseMap.h" +#include namespace clang { @@ -47,6 +48,9 @@ private: Sema &S; Policy DefaultPolicy; + class InterProceduralData; + std::unique_ptr IPData; + enum VisitFlag { NotVisited = 0, Visited = 1, Pending = 2 }; llvm::DenseMap VisitedFD; @@ -88,6 +92,7 @@ private: public: AnalysisBasedWarnings(Sema &s); + ~AnalysisBasedWarnings(); void IssueWarnings(Policy P, FunctionScopeInfo *fscope, const Decl *D, QualType BlockType); @@ -97,6 +102,7 @@ public: void PrintStats() const; }; -}} // end namespace clang::sema +} // namespace sema +} // namespace clang #endif diff --git a/clang/lib/Analysis/CalledOnceCheck.cpp b/clang/lib/Analysis/CalledOnceCheck.cpp index d24e0b500564..29021b0a9016 100644 --- a/clang/lib/Analysis/CalledOnceCheck.cpp +++ b/clang/lib/Analysis/CalledOnceCheck.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "clang/Analysis/Analyses/CalledOnceCheck.h" +#include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" #include "clang/AST/Decl.h" #include "clang/AST/DeclBase.h" @@ -57,6 +58,20 @@ constexpr llvm::StringLiteral CONVENTIONAL_SUFFIXES[] = { constexpr llvm::StringLiteral CONVENTIONAL_CONDITIONS[] = { "error", "cancel", "shouldCall", "done", "OK", "success"}; +struct KnownCalledOnceParameter { + llvm::StringLiteral FunctionName; + unsigned ParamIndex; +}; +constexpr KnownCalledOnceParameter KNOWN_CALLED_ONCE_PARAMETERS[] = { + {"dispatch_async", 1}, + {"dispatch_async_and_wait", 1}, + {"dispatch_after", 2}, + {"dispatch_sync", 1}, + {"dispatch_once", 1}, + {"dispatch_barrier_async", 1}, + {"dispatch_barrier_async_and_wait", 1}, + {"dispatch_barrier_sync", 1}}; + class ParameterStatus { public: // Status kind is basically the main part of parameter's status. 
@@ -929,9 +944,9 @@ private: "Block should have at least two successors at this point"); if (auto Clarification = NotCalledClarifier::clarify(Parent, Succ)) { const ParmVarDecl *Parameter = getParameter(Index); - Handler.handleNeverCalled(Parameter, Clarification->Location, - Clarification->Reason, !IsEscape, - !isExplicitlyMarked(Parameter)); + Handler.handleNeverCalled( + Parameter, AC.getDecl(), Clarification->Location, + Clarification->Reason, !IsEscape, !isExplicitlyMarked(Parameter)); } } } @@ -1091,6 +1106,91 @@ private: return false; } + // Return a call site where the block is called exactly once or null otherwise + const Expr *getBlockGuaraneedCallSite(const BlockExpr *Block) const { + ParentMap &PM = AC.getParentMap(); + + // We don't want to track the block through assignments and so on, instead + // we simply see how the block used and if it's used directly in a call, + // we decide based on call to what it is. + // + // In order to do this, we go up the parents of the block looking for + // a call or a message expressions. These might not be immediate parents + // of the actual block expression due to casts and parens, so we skip them. + for (const Stmt *Prev = Block, *Current = PM.getParent(Block); + Current != nullptr; Prev = Current, Current = PM.getParent(Current)) { + // Skip no-op (for our case) operations. + if (isa(Current) || isa(Current)) + continue; + + // At this point, Prev represents our block as an immediate child of the + // call. + if (const auto *Call = dyn_cast(Current)) { + // It might be the call of the Block itself... + if (Call->getCallee() == Prev) + return Call; + + // ...or it can be an indirect call of the block. + return shouldBlockArgumentBeCalledOnce(Call, Prev) ? Call : nullptr; + } + if (const auto *Message = dyn_cast(Current)) { + return shouldBlockArgumentBeCalledOnce(Message, Prev) ? Message + : nullptr; + } + + break; + } + + return nullptr; + } + + template + bool shouldBlockArgumentBeCalledOnce(const CallLikeExpr *CallOrMessage, + const Stmt *BlockArgument) const { + // CallExpr::arguments does not interact nicely with llvm::enumerate. + llvm::ArrayRef Arguments = llvm::makeArrayRef( + CallOrMessage->getArgs(), CallOrMessage->getNumArgs()); + + for (const auto &Argument : llvm::enumerate(Arguments)) { + if (Argument.value() == BlockArgument) { + return shouldBlockArgumentBeCalledOnce(CallOrMessage, Argument.index()); + } + } + + return false; + } + + bool shouldBlockArgumentBeCalledOnce(const CallExpr *Call, + unsigned ParamIndex) const { + const FunctionDecl *Function = Call->getDirectCallee(); + return shouldBlockArgumentBeCalledOnce(Function, ParamIndex) || + shouldBeCalledOnce(Call, ParamIndex); + } + + bool shouldBlockArgumentBeCalledOnce(const ObjCMessageExpr *Message, + unsigned ParamIndex) const { + // At the moment, we don't have any Obj-C methods we want to specifically + // check in here. + return shouldBeCalledOnce(Message, ParamIndex); + } + + static bool shouldBlockArgumentBeCalledOnce(const FunctionDecl *Function, + unsigned ParamIndex) { + // There is a list of important API functions that while not following + // conventions nor being directly annotated, still guarantee that the + // callback parameter will be called exactly once. + // + // Here we check if this is the case. 
+    return Function &&
+           llvm::any_of(KNOWN_CALLED_ONCE_PARAMETERS,
+                        [Function, ParamIndex](
+                            const KnownCalledOnceParameter &Reference) {
+                          return Reference.FunctionName == Function->getName() &&
+                                 Reference.ParamIndex == ParamIndex;
+                        });
+  }
+
   /// Return true if the analyzed function is actually a default implementation
   /// of the method that has to be overriden.
   ///
@@ -1437,17 +1537,44 @@ public:
   }
 
   void VisitBlockExpr(const BlockExpr *Block) {
+    // Block expressions are tricky. It is a very common practice to capture
+    // completion handlers by blocks and use them there.
+    // For this reason, it is important to analyze blocks and report warnings
+    // for completion handler misuse in blocks.
+    //
+    // However, it can be quite difficult to track how the block itself is
+    // being used. The full precise analysis of that would be similar to alias
+    // analysis for completion handlers and can be too heavyweight for a
+    // compile-time diagnostic. Instead, we judge by the immediate use of the
+    // block.
+    //
+    // Here, we try to find a call expression where we know due to conventions,
+    // annotations, or other reasons that the block is called once and only
+    // once.
+    const Expr *CalledOnceCallSite = getBlockGuaraneedCallSite(Block);
+
+    // We need to report this information to the handler because in the
+    // situation when we know that the block is called exactly once, we can be
+    // stricter in terms of reported diagnostics.
+    if (CalledOnceCallSite) {
+      Handler.handleBlockThatIsGuaranteedToBeCalledOnce(Block->getBlockDecl());
+    } else {
+      Handler.handleBlockWithNoGuarantees(Block->getBlockDecl());
+    }
 
     for (const auto &Capture : Block->getBlockDecl()->captures()) {
-      // If a block captures a tracked parameter, it should be
-      // considered escaped.
-      // On one hand, blocks that do that should definitely call it on
-      // every path. However, it is not guaranteed that the block
-      // itself gets called whenever it gets created.
-      //
-      // Because we don't want to track blocks and whether they get called,
-      // we consider such parameters simply escaped.
       if (const auto *Param = dyn_cast(Capture.getVariable())) {
-        checkEscapee(*Param);
+        if (auto Index = getIndex(*Param)) {
+          if (CalledOnceCallSite) {
+            // The call site of a block can be considered a call site of the
+            // captured parameter we track.
+            processCallFor(*Index, CalledOnceCallSite);
+          } else {
+            // We should still consider this block as an escape for the
+            // parameter if we don't know about its call site or the number of
+            // times it can be invoked.
+            processEscapeFor(*Index);
+          }
+        }
       }
     }
   }
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index edd9742ed207..bcd6a00d7ba5 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -1506,6 +1506,25 @@ static void diagnoseRepeatedUseOfWeak(Sema &S,
   }
 }
 
+namespace clang {
+namespace {
+typedef SmallVector OptionalNotes;
+typedef std::pair DelayedDiag;
+typedef std::list DiagList;
+
+struct SortDiagBySourceLocation {
+  SourceManager &SM;
+  SortDiagBySourceLocation(SourceManager &SM) : SM(SM) {}
+
+  bool operator()(const DelayedDiag &left, const DelayedDiag &right) {
+    // Although this call will be slow, this is only called when outputting
+    // multiple warnings.
+ return SM.isBeforeInTranslationUnit(left.first.first, right.first.first); + } +}; +} // anonymous namespace +} // namespace clang + namespace { class UninitValsDiagReporter : public UninitVariablesHandler { Sema &S; @@ -1626,9 +1645,35 @@ private: } }; +/// Inter-procedural data for the called-once checker. +class CalledOnceInterProceduralData { +public: + // Add the delayed warning for the given block. + void addDelayedWarning(const BlockDecl *Block, + PartialDiagnosticAt &&Warning) { + DelayedBlockWarnings[Block].emplace_back(std::move(Warning)); + } + // Report all of the warnings we've gathered for the given block. + void flushWarnings(const BlockDecl *Block, Sema &S) { + for (const PartialDiagnosticAt &Delayed : DelayedBlockWarnings[Block]) + S.Diag(Delayed.first, Delayed.second); + + discardWarnings(Block); + } + // Discard all of the warnings we've gathered for the given block. + void discardWarnings(const BlockDecl *Block) { + DelayedBlockWarnings.erase(Block); + } + +private: + using DelayedDiagnostics = SmallVector; + llvm::DenseMap DelayedBlockWarnings; +}; + class CalledOnceCheckReporter : public CalledOnceCheckHandler { public: - CalledOnceCheckReporter(Sema &S) : S(S) {} + CalledOnceCheckReporter(Sema &S, CalledOnceInterProceduralData &Data) + : S(S), Data(Data) {} void handleDoubleCall(const ParmVarDecl *Parameter, const Expr *Call, const Expr *PrevCall, bool IsCompletionHandler, bool Poised) override { @@ -1649,14 +1694,24 @@ public: << Parameter << /* Captured */ false; } - void handleNeverCalled(const ParmVarDecl *Parameter, const Stmt *Where, - NeverCalledReason Reason, bool IsCalledDirectly, + void handleNeverCalled(const ParmVarDecl *Parameter, const Decl *Function, + const Stmt *Where, NeverCalledReason Reason, + bool IsCalledDirectly, bool IsCompletionHandler) override { auto DiagToReport = IsCompletionHandler ? diag::warn_completion_handler_never_called_when : diag::warn_called_once_never_called_when; - S.Diag(Where->getBeginLoc(), DiagToReport) - << Parameter << IsCalledDirectly << (unsigned)Reason; + PartialDiagnosticAt Warning(Where->getBeginLoc(), S.PDiag(DiagToReport) + << Parameter + << IsCalledDirectly + << (unsigned)Reason); + + if (const auto *Block = dyn_cast(Function)) { + // We shouldn't report these warnings on blocks immediately + Data.addDelayedWarning(Block, std::move(Warning)); + } else { + S.Diag(Warning.first, Warning.second); + } } void handleCapturedNeverCalled(const ParmVarDecl *Parameter, @@ -1669,8 +1724,18 @@ public: << Parameter << /* Captured */ true; } + void + handleBlockThatIsGuaranteedToBeCalledOnce(const BlockDecl *Block) override { + Data.flushWarnings(Block, S); + } + + void handleBlockWithNoGuarantees(const BlockDecl *Block) override { + Data.discardWarnings(Block); + } + private: Sema &S; + CalledOnceInterProceduralData &Data; }; constexpr unsigned CalledOnceWarnings[] = { @@ -1703,25 +1768,6 @@ bool shouldAnalyzeCalledOnceParameters(const DiagnosticsEngine &Diags, } } // anonymous namespace -namespace clang { -namespace { -typedef SmallVector OptionalNotes; -typedef std::pair DelayedDiag; -typedef std::list DiagList; - -struct SortDiagBySourceLocation { - SourceManager &SM; - SortDiagBySourceLocation(SourceManager &SM) : SM(SM) {} - - bool operator()(const DelayedDiag &left, const DelayedDiag &right) { - // Although this call will be slow, this is only called when outputting - // multiple warnings. 
- return SM.isBeforeInTranslationUnit(left.first.first, right.first.first); - } -}; -} // anonymous namespace -} // namespace clang - //===----------------------------------------------------------------------===// // -Wthread-safety //===----------------------------------------------------------------------===// @@ -2107,54 +2153,68 @@ public: // warnings on a function, method, or block. //===----------------------------------------------------------------------===// -clang::sema::AnalysisBasedWarnings::Policy::Policy() { +sema::AnalysisBasedWarnings::Policy::Policy() { enableCheckFallThrough = 1; enableCheckUnreachable = 0; enableThreadSafetyAnalysis = 0; enableConsumedAnalysis = 0; } +/// InterProceduralData aims to be a storage of whatever data should be passed +/// between analyses of different functions. +/// +/// At the moment, its primary goal is to make the information gathered during +/// the analysis of the blocks available during the analysis of the enclosing +/// function. This is important due to the fact that blocks are analyzed before +/// the enclosed function is even parsed fully, so it is not viable to access +/// anything in the outer scope while analyzing the block. On the other hand, +/// re-building CFG for blocks and re-analyzing them when we do have all the +/// information (i.e. during the analysis of the enclosing function) seems to be +/// ill-designed. +class sema::AnalysisBasedWarnings::InterProceduralData { +public: + // It is important to analyze blocks within functions because it's a very + // common pattern to capture completion handler parameters by blocks. + CalledOnceInterProceduralData CalledOnceData; +}; + static unsigned isEnabled(DiagnosticsEngine &D, unsigned diag) { return (unsigned)!D.isIgnored(diag, SourceLocation()); } -clang::sema::AnalysisBasedWarnings::AnalysisBasedWarnings(Sema &s) - : S(s), - NumFunctionsAnalyzed(0), - NumFunctionsWithBadCFGs(0), - NumCFGBlocks(0), - MaxCFGBlocksPerFunction(0), - NumUninitAnalysisFunctions(0), - NumUninitAnalysisVariables(0), - MaxUninitAnalysisVariablesPerFunction(0), - NumUninitAnalysisBlockVisits(0), - MaxUninitAnalysisBlockVisitsPerFunction(0) { +sema::AnalysisBasedWarnings::AnalysisBasedWarnings(Sema &s) + : S(s), IPData(std::make_unique()), + NumFunctionsAnalyzed(0), NumFunctionsWithBadCFGs(0), NumCFGBlocks(0), + MaxCFGBlocksPerFunction(0), NumUninitAnalysisFunctions(0), + NumUninitAnalysisVariables(0), MaxUninitAnalysisVariablesPerFunction(0), + NumUninitAnalysisBlockVisits(0), + MaxUninitAnalysisBlockVisitsPerFunction(0) { using namespace diag; DiagnosticsEngine &D = S.getDiagnostics(); DefaultPolicy.enableCheckUnreachable = - isEnabled(D, warn_unreachable) || - isEnabled(D, warn_unreachable_break) || - isEnabled(D, warn_unreachable_return) || - isEnabled(D, warn_unreachable_loop_increment); + isEnabled(D, warn_unreachable) || isEnabled(D, warn_unreachable_break) || + isEnabled(D, warn_unreachable_return) || + isEnabled(D, warn_unreachable_loop_increment); - DefaultPolicy.enableThreadSafetyAnalysis = - isEnabled(D, warn_double_lock); + DefaultPolicy.enableThreadSafetyAnalysis = isEnabled(D, warn_double_lock); DefaultPolicy.enableConsumedAnalysis = - isEnabled(D, warn_use_in_invalid_state); + isEnabled(D, warn_use_in_invalid_state); } +// We need this here for unique_ptr with forward declared class. 
+sema::AnalysisBasedWarnings::~AnalysisBasedWarnings() = default; + static void flushDiagnostics(Sema &S, const sema::FunctionScopeInfo *fscope) { for (const auto &D : fscope->PossiblyUnreachableDiags) S.Diag(D.Loc, D.PD); } -void clang::sema:: -AnalysisBasedWarnings::IssueWarnings(sema::AnalysisBasedWarnings::Policy P, - sema::FunctionScopeInfo *fscope, - const Decl *D, QualType BlockType) { +void clang::sema::AnalysisBasedWarnings::IssueWarnings( + sema::AnalysisBasedWarnings::Policy P, sema::FunctionScopeInfo *fscope, + const Decl *D, QualType BlockType) { // We avoid doing analysis-based warnings when there are errors for // two reasons: @@ -2346,7 +2406,7 @@ AnalysisBasedWarnings::IssueWarnings(sema::AnalysisBasedWarnings::Policy P, if (S.getLangOpts().ObjC && shouldAnalyzeCalledOnceParameters(Diags, D->getBeginLoc())) { if (AC.getCFG()) { - CalledOnceCheckReporter Reporter(S); + CalledOnceCheckReporter Reporter(S, IPData->CalledOnceData); checkCalledOnceParameters( AC, Reporter, shouldAnalyzeCalledOnceConventions(Diags, D->getBeginLoc())); diff --git a/clang/test/SemaObjC/warn-called-once.m b/clang/test/SemaObjC/warn-called-once.m index 7d0679035238..825d491f53bb 100644 --- a/clang/test/SemaObjC/warn-called-once.m +++ b/clang/test/SemaObjC/warn-called-once.m @@ -31,6 +31,16 @@ typedef struct { @class NSString, Protocol; extern void NSLog(NSString *format, ...); +typedef int group_t; +typedef struct dispatch_queue_s *dispatch_queue_t; +typedef void (^dispatch_block_t)(void); +extern dispatch_queue_t queue; + +void dispatch_group_async(dispatch_queue_t queue, + group_t group, + dispatch_block_t block); +void dispatch_async(dispatch_queue_t queue, dispatch_block_t block); + void escape(void (^callback)(void)); void escape_void(void *); void indirect_call(void (^callback)(void) CALLED_ONCE); @@ -225,11 +235,11 @@ void indirect_call_within_direct_call(void (^callback)(void) CALLED_ONCE, } void block_call_1(void (^callback)(void) CALLED_ONCE) { - indirect_call(^{ - callback(); - }); - callback(); - // no-warning + indirect_call( // expected-note{{previous call is here}} + ^{ + callback(); + }); + callback(); // expected-warning{{'callback' parameter marked 'called_once' is called twice}} } void block_call_2(void (^callback)(void) CALLED_ONCE) { @@ -255,7 +265,7 @@ void block_call_4(int cond, void (^callback)(void) CALLED_ONCE) { // expected-warning@-1{{'callback' parameter marked 'called_once' is never used when taking false branch}} escape(callback); } - }(); + }(); // no-warning } void block_call_5(void (^outer)(void) CALLED_ONCE) { @@ -273,6 +283,32 @@ void block_with_called_once(void (^outer)(void) CALLED_ONCE) { outer(); // expected-warning{{'outer' parameter marked 'called_once' is called twice}} } +void block_dispatch_call(int cond, void (^callback)(void) CALLED_ONCE) { + dispatch_async(queue, ^{ + if (cond) // expected-warning{{'callback' parameter marked 'called_once' is never called when taking false branch}} + callback(); + }); +} + +void block_escape_call_1(int cond, void (^callback)(void) CALLED_ONCE) { + escape_void((__bridge void *)^{ + if (cond) { + // no-warning + callback(); + } + }); +} + +void block_escape_call_2(int cond, void (^callback)(void) CALLED_ONCE) { + escape_void((__bridge void *)^{ + if (cond) { + callback(); // expected-note{{previous call is here}} + } + // Double call can still be reported. 
+ callback(); // expected-warning{{'callback' parameter marked 'called_once' is called twice}} + }); +} + void never_called_one_exit(int cond, void (^callback)(void) CALLED_ONCE) { if (!cond) // expected-warning{{'callback' parameter marked 'called_once' is never called when taking true branch}} return; @@ -822,11 +858,10 @@ void suppression_3(int cond, void (^callback)(void) CALLED_ONCE) { - (void)block_call_1:(void (^)(void))CALLED_ONCE callback { // We consider captures by blocks as escapes - [self indirect_call:(^{ + [self indirect_call:(^{ // expected-note{{previous call is here}} callback(); })]; - callback(); - // no-warning + callback(); // expected-warning{{'callback' parameter marked 'called_once' is called twice}} } - (void)block_call_2:(int)cond callback:(void (^)(void))CALLED_ONCE callback { -- GitLab From 0002d4bf3624ef42c372c30baa32504fe25a4103 Mon Sep 17 00:00:00 2001 From: Bing1 Yu Date: Thu, 18 Mar 2021 17:07:49 +0800 Subject: [PATCH 0010/1000] [X86][AMX][NFC] Give correct Passname for Tile Register Pre-configure --- llvm/lib/Target/X86/X86PreTileConfig.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 1c91e87e69d5..dd35a5d1c057 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -1,4 +1,4 @@ -//===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===// +//===-- X86PreTileConfig.cpp - Tile Register Pre-configure-----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -87,10 +87,10 @@ public: char X86PreTileConfig::ID = 0; INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig", - "Tile Register Configure", false, false) + "Tile Register Pre-configure", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", - "Tile Register Configure", false, false) + "Tile Register Pre-configure", false, false) void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); -- GitLab From 209a626ede412cc37b87896579a4ee24af82aa7d Mon Sep 17 00:00:00 2001 From: "Wang, Pengfei" Date: Thu, 18 Mar 2021 17:01:06 +0800 Subject: [PATCH 0011/1000] [X86][NFC] Pre-commit test case for the fix of ldtilecfg insertion. 
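The new test_loop2 function exercises a tile register that is live across
calls inside a loop; its CHECK lines record the current (pre-fix) placement of
ldtilecfg, so the follow-up fix will show up as a test diff.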
--- llvm/test/CodeGen/X86/AMX/amx-across-func.ll | 94 ++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll index b687d03f92ba..2bb73e26c431 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -263,6 +263,100 @@ define dso_local i32 @test_loop(i32 %0) nounwind { ret i32 %20 } +define dso_local void @test_loop2(i32 %0) nounwind { +; CHECK-LABEL: test_loop2: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: subq $3024, %rsp # imm = 0xBD0 +; CHECK-NEXT: movl %edi, %ebx +; CHECK-NEXT: movl $buf, %r14d +; CHECK-NEXT: movl $32, %r15d +; CHECK-NEXT: movw $8, %bp +; CHECK-NEXT: movl $buf+2048, %r12d +; CHECK-NEXT: .p2align 4, 0x90 +; CHECK-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: callq foo +; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: testl %ebx, %ebx +; CHECK-NEXT: jle .LBB3_3 +; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0 +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tilestored %tmm0, 1024(%rsp,%rax) # 1024-byte Folded Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; CHECK-NEXT: movabsq $64, %rax +; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm0 # 1024-byte Folded Reload +; CHECK-NEXT: tilestored %tmm0, (%r12,%r15) +; CHECK-NEXT: callq foo +; CHECK-NEXT: jmp .LBB3_1 +; CHECK-NEXT: .LBB3_3: +; CHECK-NEXT: addq $3024, %rsp # imm = 0xBD0 +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: tilerelease +; CHECK-NEXT: retq +; +; IPRA-LABEL: test_loop2: +; IPRA: # %bb.0: +; IPRA-NEXT: subq $72, %rsp +; IPRA-NEXT: movl $buf, %eax +; IPRA-NEXT: movl $32, %ecx +; IPRA-NEXT: movw $8, %dx +; IPRA-NEXT: movl $buf+2048, %esi +; IPRA-NEXT: .p2align 4, 0x90 +; IPRA-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; IPRA-NEXT: callq foo +; IPRA-NEXT: testl %edi, %edi +; IPRA-NEXT: jle .LBB3_3 +; IPRA-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; IPRA-NEXT: vpxord %zmm0, %zmm0, %zmm0 +; IPRA-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $1, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movb $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: movw $8, {{[0-9]+}}(%rsp) +; IPRA-NEXT: ldtilecfg {{[0-9]+}}(%rsp) +; IPRA-NEXT: tileloadd (%rax,%rcx), %tmm0 +; IPRA-NEXT: callq foo +; IPRA-NEXT: tilestored %tmm0, (%rsi,%rcx) +; IPRA-NEXT: callq foo +; IPRA-NEXT: jmp .LBB3_1 +; IPRA-NEXT: .LBB3_3: +; IPRA-NEXT: addq $72, %rsp +; IPRA-NEXT: tilerelease +; IPRA-NEXT: vzeroupper +; IPRA-NEXT: retq + br label %2 +2: + %3 = phi i32 [ 0, %1 ], [ %7, %5 ] + call void @foo() + %4 = icmp sgt i32 %0, 0 + br i1 %4, label %5, label %8 +5: + %6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32) + call void @foo() + tail call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6) + call void @foo() + %7 = add 
i32 %3, 1 + br label %2 +8: + ret void +} + declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64) declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx) declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx) -- GitLab From 02834e1bd94602bb3d1c603fd9fb874eb0e75290 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Wed, 3 Mar 2021 12:04:08 +0300 Subject: [PATCH 0012/1000] [mlir][ODS] Get rid of limitations in rewriters generator Do not limit the number of arguments in rewriter pattern. Introduce separate `FmtStrVecObject` class to handle format of variadic `std::string` array. Reviewed By: mehdi_amini Differential Revision: https://reviews.llvm.org/D97839 --- mlir/include/mlir/TableGen/Format.h | 19 ++++++++++ mlir/lib/TableGen/Format.cpp | 19 ++++++++++ mlir/test/mlir-tblgen/rewriter-indexing.td | 27 +++++++++++++++ mlir/tools/mlir-tblgen/RewriterGen.cpp | 40 +++++++++------------- 4 files changed, 81 insertions(+), 24 deletions(-) diff --git a/mlir/include/mlir/TableGen/Format.h b/mlir/include/mlir/TableGen/Format.h index 18a7a6f985b8..441e05c29f26 100644 --- a/mlir/include/mlir/TableGen/Format.h +++ b/mlir/include/mlir/TableGen/Format.h @@ -186,6 +186,20 @@ public: } }; +class FmtStrVecObject : public FmtObjectBase { +public: + using StrFormatAdapter = + decltype(llvm::detail::build_format_adapter(std::declval())); + + FmtStrVecObject(StringRef fmt, const FmtContext *ctx, + ArrayRef params); + FmtStrVecObject(FmtStrVecObject const &that) = delete; + FmtStrVecObject(FmtStrVecObject &&that); + +private: + SmallVector parameters; +}; + /// Formats text by substituting placeholders in format string with replacement /// parameters. /// @@ -234,6 +248,11 @@ inline auto tgfmt(StringRef fmt, const FmtContext *ctx, Ts &&... 
vals) llvm::detail::build_format_adapter(std::forward<Ts>(vals))...)); } +inline FmtStrVecObject tgfmt(StringRef fmt, const FmtContext *ctx, + ArrayRef<std::string> params) { + return FmtStrVecObject(fmt, ctx, params); +} + } // end namespace tblgen } // end namespace mlir diff --git a/mlir/lib/TableGen/Format.cpp b/mlir/lib/TableGen/Format.cpp index 7d17a0aef3f9..10834510b767 100644 --- a/mlir/lib/TableGen/Format.cpp +++ b/mlir/lib/TableGen/Format.cpp @@ -173,3 +173,22 @@ void FmtObjectBase::format(raw_ostream &s) const { adapters[repl.index]->format(s, /*Options=*/""); } } + +FmtStrVecObject::FmtStrVecObject(StringRef fmt, const FmtContext *ctx, + ArrayRef<std::string> params) + : FmtObjectBase(fmt, ctx, params.size()) { + parameters.reserve(params.size()); + for (std::string p : params) + parameters.push_back(llvm::detail::build_format_adapter(std::move(p))); + + adapters.reserve(parameters.size()); + for (auto &p : parameters) + adapters.push_back(&p); +} + +FmtStrVecObject::FmtStrVecObject(FmtStrVecObject &&that) + : FmtObjectBase(std::move(that)), parameters(std::move(that.parameters)) { + adapters.reserve(parameters.size()); + for (auto &p : parameters) + adapters.push_back(&p); +} diff --git a/mlir/test/mlir-tblgen/rewriter-indexing.td b/mlir/test/mlir-tblgen/rewriter-indexing.td index a6b403285765..cbdeff9c743d 100644 --- a/mlir/test/mlir-tblgen/rewriter-indexing.td +++ b/mlir/test/mlir-tblgen/rewriter-indexing.td @@ -58,3 +58,30 @@ def test2 : Pat<(COp $attr1, $op1, $attr2, (AOp $op2)), def test3 : Pat<(BOp $attr, (AOp:$a $input)), (BOp $attr, (AOp $input), (location $a))>; +def DOp : NS_Op<"d_op", []> { + let arguments = (ins + AnyInteger:$v1, + AnyInteger:$v2, + AnyInteger:$v3, + AnyInteger:$v4, + AnyInteger:$v5, + AnyInteger:$v6, + AnyInteger:$v7, + AnyInteger:$v8, + AnyInteger:$v9, + AnyInteger:$v10 + ); + + let results = (outs AnyInteger); +} + +def NativeBuilder : + NativeCodeCall<[{ + nativeCall($_builder, $_loc, $0, $1, $2, $3, $4, $5, $6, $7, $8, $9) + }]>; + +// Check Pattern with large number of DAG arguments passed to NativeCodeCall +// CHECK: struct test4 : public ::mlir::RewritePattern { +// CHECK: nativeCall(rewriter, odsLoc, (*v1.begin()), (*v2.begin()), (*v3.begin()), (*v4.begin()), (*v5.begin()), (*v6.begin()), (*v7.begin()), (*v8.begin()), (*v9.begin()), (*v10.begin())) def test4 : Pat<(DOp $v1, $v2, $v3, $v4, $v5, $v6, $v7, $v8, $v9, $v10), + (NativeBuilder $v1, $v2, $v3, $v4, $v5, $v6, $v7, $v8, $v9, $v10)>; diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp index 5781870e0df7..7ee05f2114a6 100644 --- a/mlir/tools/mlir-tblgen/RewriterGen.cpp +++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp @@ -251,12 +251,8 @@ void PatternEmitter::emitNativeCodeMatch(DagNode tree, StringRef opName, // TODO(suderman): iterate through arguments, determine their types, output // names. 
- SmallVector<std::string, 8> capture(8); - if (tree.getNumArgs() > 8) { - PrintFatalError(loc, - "unsupported NativeCodeCall matcher argument numbers: " + - Twine(tree.getNumArgs())); - } + SmallVector<std::string, 16> capture; + capture.push_back(opName.str()); raw_indented_ostream::DelimitedScope scope(os); @@ -274,7 +270,7 @@ void PatternEmitter::emitNativeCodeMatch(DagNode tree, StringRef opName, } } - capture[i] = std::move(argName); + capture.push_back(std::move(argName)); } bool hasLocationDirective; @@ -282,21 +278,20 @@ std::tie(hasLocationDirective, locToUse) = getLocation(tree); auto fmt = tree.getNativeCodeTemplate(); - auto nativeCodeCall = std::string(tgfmt( - fmt, &fmtCtx.addSubst("_loc", locToUse), opName, capture[0], capture[1], - capture[2], capture[3], capture[4], capture[5], capture[6], capture[7])); + auto nativeCodeCall = + std::string(tgfmt(fmt, &fmtCtx.addSubst("_loc", locToUse), capture)); os << "if (failed(" << nativeCodeCall << ")) return ::mlir::failure();\n"; for (int i = 0, e = tree.getNumArgs(); i != e; ++i) { auto name = tree.getArgName(i); if (!name.empty() && name != "_") { - os << formatv("{0} = {1};\n", name, capture[i]); + os << formatv("{0} = {1};\n", name, capture[i + 1]); } } for (int i = 0, e = tree.getNumArgs(); i != e; ++i) { - std::string argName = capture[i]; + std::string argName = capture[i + 1]; // Handle nested DAG construct first if (DagNode argTree = tree.getArgAsNestedDag(i)) { @@ -915,29 +910,26 @@ std::string PatternEmitter::handleReplaceWithNativeCodeCall(DagNode tree, LLVM_DEBUG(llvm::dbgs() << '\n'); auto fmt = tree.getNativeCodeTemplate(); - // TODO: replace formatv arguments with the exact specified args. - SmallVector<std::string, 8> attrs(8); - if (tree.getNumArgs() > 8) { - PrintFatalError(loc, - "unsupported NativeCodeCall replace argument numbers: " + - Twine(tree.getNumArgs())); - } + + SmallVector<std::string, 16> attrs; + bool hasLocationDirective; std::string locToUse; std::tie(hasLocationDirective, locToUse) = getLocation(tree); for (int i = 0, e = tree.getNumArgs() - hasLocationDirective; i != e; ++i) { if (tree.isNestedDagArg(i)) { - attrs[i] = handleResultPattern(tree.getArgAsNestedDag(i), i, depth + 1); + attrs.push_back( + handleResultPattern(tree.getArgAsNestedDag(i), i, depth + 1)); } else { - attrs[i] = handleOpArgument(tree.getArgAsLeaf(i), tree.getArgName(i)); + attrs.push_back( + handleOpArgument(tree.getArgAsLeaf(i), tree.getArgName(i))); } LLVM_DEBUG(llvm::dbgs() << "NativeCodeCall argument #" << i << " replacement: " << attrs[i] << "\n"); } - return std::string(tgfmt(fmt, &fmtCtx.addSubst("_loc", locToUse), attrs[0], - attrs[1], attrs[2], attrs[3], attrs[4], attrs[5], - attrs[6], attrs[7])); + + return std::string(tgfmt(fmt, &fmtCtx.addSubst("_loc", locToUse), attrs)); } int PatternEmitter::getNodeValueCount(DagNode node) { -- GitLab From 4a7afc9a8843f4793296a260f7153fd2ef4ec497 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Thu, 11 Mar 2021 14:22:47 +0300 Subject: [PATCH 0013/1000] [-Wcalled-once-parameter] Fix false positives for cleanup attr The cleanup attribute allows users to attach destructor-like functions to variable declarations to be called whenever they leave the scope. The logic of such functions is not supported by Clang's CFG and is too hard to reason about. In order to avoid false positives in this situation, we assume that we didn't see ALL of the execution paths of the function and, thus, can warn only about multiple-call violations.
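For illustration, a minimal sketch of the pattern the checker must now treat conservatively; the code and names below (run_deferred, demo) are hypothetical, not part of the patch. Clang's GNU-style cleanup attribute runs the given function on every exit from the enclosing scope, so the CFG contains no explicit call site for the tracked parameter on those paths:

// Hypothetical C++ sketch (builds with Clang, which supports the GNU
// cleanup attribute): the cleanup callback fires on every scope exit.
static void run_deferred(void (**fn)(void)) {
  (*fn)(); // invoked automatically when the annotated variable leaves scope
}

void demo(int error, void (*handler)(void)) {
  // The cleanup below calls `handler` on every exit path, but that call is
  // invisible to a plain CFG walk, so a "never called" report would be a
  // false positive here; only the "called twice" case stays provable.
  void (*deferred)(void) __attribute__((cleanup(run_deferred))) = handler;
  if (error)
    return; // the cleanup still invokes `handler` on this early return
}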
rdar://74441906 Differential Revision: https://reviews.llvm.org/D98694 --- clang/lib/Analysis/CalledOnceCheck.cpp | 19 ++++++++++-- clang/test/SemaObjC/warn-called-once.m | 42 ++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/clang/lib/Analysis/CalledOnceCheck.cpp b/clang/lib/Analysis/CalledOnceCheck.cpp index 29021b0a9016..ab56d3e3c988 100644 --- a/clang/lib/Analysis/CalledOnceCheck.cpp +++ b/clang/lib/Analysis/CalledOnceCheck.cpp @@ -812,8 +812,12 @@ private: } } - // Early exit if we don't have parameters for extra analysis. - if (NotCalledOnEveryPath.none() && NotUsedOnEveryPath.none()) + // Early exit if we don't have parameters for extra analysis... + if (NotCalledOnEveryPath.none() && NotUsedOnEveryPath.none() && + // ... or if we've seen variables with cleanup functions. + // We can't reason that we've seen every path in this case, + // and thus abandon reporting any warnings that imply that. + !FunctionHasCleanupVars) return; // We are looking for a pair of blocks A, B so that the following is true: @@ -1601,6 +1605,10 @@ public: if (Var->getInit()) { checkEscapee(Var->getInit()); } + + if (Var->hasAttr<CleanupAttr>()) { + FunctionHasCleanupVars = true; + } } } } @@ -1669,6 +1677,13 @@ private: // around. bool SuppressOnConventionalErrorPaths = false; + // The user can annotate variable declarations with cleanup functions, which + // essentially imposes a custom destructor logic on that variable. + // It is possible to use it, however, to call tracked parameters on all exits + // from the function. For this reason, we track the fact that the function + // actually has these. + bool FunctionHasCleanupVars = false; + State CurrentState; ParamSizedVector<const ParmVarDecl *> TrackedParams; CFGSizedVector<State> States; diff --git a/clang/test/SemaObjC/warn-called-once.m b/clang/test/SemaObjC/warn-called-once.m index 825d491f53bb..ff2778d4bd0a 100644 --- a/clang/test/SemaObjC/warn-called-once.m +++ b/clang/test/SemaObjC/warn-called-once.m @@ -1193,4 +1193,46 @@ void suppression_3(int cond, void (^callback)(void) CALLED_ONCE) { escape(handler); } +// rdar://74441906 +typedef void (^DeferredBlock)(void); +static inline void DefferedCallback(DeferredBlock *inBlock) { (*inBlock)(); } +#define _DEFERCONCAT(a, b) a##b +#define _DEFERNAME(a) _DEFERCONCAT(__DeferredVar_, a) +#define DEFER __extension__ __attribute__((cleanup(DefferedCallback), unused)) \ + DeferredBlock _DEFERNAME(__COUNTER__) = ^ + +- (void)test_cleanup_1:(int)cond + withCompletion:(void (^)(void))handler { + int error = 0; + DEFER { + if (error) + handler(); + }; + + if (cond) { + error = 1; + } else { + // no-warning + handler(); + } +} + +- (void)test_cleanup_2:(int)cond + withCompletion:(void (^)(void))handler { + int error = 0; + DEFER { + if (error) + handler(); + }; + + if (cond) { + error = 1; + } else { + handler(); // expected-note{{previous call is here}} + } + + // We still can warn about double call even in this case. + handler(); // expected-warning{{completion handler is called twice}} +} + @end -- GitLab From 0331399dc9346f3c5acdf784ddb96567efc9d538 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Thu, 4 Feb 2021 09:56:01 +0000 Subject: [PATCH 0014/1000] [RISCV] Support scalable-vector masked gather operations This patch supports the masked gather intrinsics in RVV. The RVV indexed load/store instructions only support the "unsigned unscaled" addressing mode; indices are implicitly zero-extended or truncated to XLEN and are treated as byte offsets. 
This ISA supports the intrinsics directly, but not the majority of various forms of the MGATHER SDNode that LLVM combines to. Any signed or scaled indexing is extended to the XLEN value type and scaled accordingly. This is done during DAG combining as widening the index types to XLEN may produce illegal vectors that require splitting, e.g. nxv16i8->nxv16i64. Support for scalable-vector CONCAT_VECTORS was added to avoid spilling via the stack when lowering split legalized index operands. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D96263 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 102 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 + llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll | 2194 +++++++++++++++++ 3 files changed, 2296 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b54e2ce73fd1..ee686102c147 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -474,6 +474,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + setOperationAction(ISD::MGATHER, VT, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); @@ -513,6 +516,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Custom); + + setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); @@ -686,8 +692,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtZbp()) { setTargetDAGCombine(ISD::OR); } - if (Subtarget.hasStdExtV()) + if (Subtarget.hasStdExtV()) { setTargetDAGCombine(ISD::FCOPYSIGN); + setTargetDAGCombine(ISD::MGATHER); + } } EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, @@ -1629,9 +1637,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, // better than going through the stack, as the default expansion does. SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); - assert(VT.isFixedLengthVector() && "Unexpected CONCAT_VECTORS lowering"); unsigned NumOpElts = - Op.getOperand(0).getSimpleValueType().getVectorNumElements(); + Op.getOperand(0).getSimpleValueType().getVectorMinNumElements(); SDValue Vec = DAG.getUNDEF(VT); for (const auto &OpIdx : enumerate(Op->ops())) Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Vec, OpIdx.value(), @@ -1711,6 +1718,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerFixedLengthVectorSelectToRVV(Op, DAG); case ISD::FCOPYSIGN: return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG); + case ISD::MGATHER: + return lowerMGATHER(Op, DAG); } } @@ -3453,6 +3462,46 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VT, ScalableRes, DAG, Subtarget); } +// Custom lower MGATHER to a legalized form for RVV. It will then be matched to +// a RVV indexed load. 
The RVV indexed load/store instructions only support the +// "unsigned unscaled" addressing mode; indices are implicitly zero-extended or +// truncated to XLEN and are treated as byte offsets. Any signed or scaled +// indexing is extended to the XLEN value type and scaled accordingly. +SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const { + MaskedGatherSDNode *N = cast(Op.getNode()); + SDLoc DL(Op); + MVT VT = Op.getSimpleValueType(); + SDValue Index = N->getIndex(); + SDValue Mask = N->getMask(); + SDValue PassThru = N->getPassThru(); + + MVT XLenVT = Subtarget.getXLenVT(); + assert(N->getBasePtr().getSimpleValueType() == XLenVT && + "Unexpected pointer type"); + // Targets have to explicitly opt-in for extending vector loads. + assert(N->getExtensionType() == ISD::NON_EXTLOAD && + "Unexpected extending MGATHER"); + + SDValue VL = getDefaultVLOps(VT, VT, DL, DAG, Subtarget).second; + // If the mask is known to be all ones, optimize to an unmasked intrinsic; + // the selection of the masked intrinsics doesn't do this for us. + if (ISD::isConstantSplatVectorAllOnes(Mask.getNode())) { + SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vloxei, DL, XLenVT); + SDValue Ops[] = {N->getChain(), IntID, N->getBasePtr(), Index, VL}; + return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(VT, MVT::Other), Ops, + N->getMemoryVT(), N->getMemOperand()); + } + + SDValue IntID = + DAG.getTargetConstant(Intrinsic::riscv_vloxei_mask, DL, XLenVT); + SDValue Ops[] = {N->getChain(), IntID, PassThru, N->getBasePtr(), + Index, Mask, VL}; + return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, + DAG.getVTList(VT, MVT::Other), Ops, + N->getMemoryVT(), N->getMemOperand()); +} + // Returns the opcode of the target-specific SDNode that implements the 32-bit // form of the given Opcode. static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) { @@ -4470,6 +4519,49 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N->getOperand(0), DAG.getNode(ISD::FNEG, DL, VT, NewFPExtRound)); } + case ISD::MGATHER: { + if (!DCI.isBeforeLegalize()) + break; + MaskedGatherSDNode *MGN = cast(N); + SDValue Index = MGN->getIndex(); + EVT IndexVT = Index.getValueType(); + MVT XLenVT = Subtarget.getXLenVT(); + // RISCV indexed loads only support the "unsigned unscaled" addressing + // mode, so anything else must be manually legalized. + bool NeedsIdxLegalization = + MGN->isIndexScaled() || + (MGN->isIndexSigned() && IndexVT.getVectorElementType().bitsLT(XLenVT)); + if (!NeedsIdxLegalization) + break; + + SDLoc DL(N); + + // Any index legalization should first promote to XLenVT, so we don't lose + // bits when scaling. This may create an illegal index type so we let + // LLVM's legalization take care of the splitting. + if (IndexVT.getVectorElementType().bitsLT(XLenVT)) { + IndexVT = IndexVT.changeVectorElementType(XLenVT); + Index = DAG.getNode(MGN->isIndexSigned() ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, + DL, IndexVT, Index); + } + + unsigned Scale = N->getConstantOperandVal(5); + if (MGN->isIndexScaled() && Scale != 1) { + // Manually scale the indices by the element size. + // TODO: Sanitize the scale operand here? 
+ assert(isPowerOf2_32(Scale) && "Expecting power-of-two types"); + SDValue SplatScale = DAG.getConstant(Log2_32(Scale), DL, IndexVT); + Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index, SplatScale); + } + + ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_UNSCALED; + return DAG.getMaskedGather( + N->getVTList(), MGN->getMemoryVT(), DL, + {MGN->getChain(), MGN->getPassThru(), MGN->getMask(), MGN->getBasePtr(), + Index, MGN->getScale()}, + MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType()); + } } return SDValue(); @@ -6890,6 +6982,10 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( return Result; } +bool RISCVTargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { + return false; +} + bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index d454df95b630..1aea84dd258a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -432,6 +432,8 @@ public: static MVT getContainerForFixedLengthVector(SelectionDAG &DAG, MVT VT, const RISCVSubtarget &Subtarget); + bool shouldRemoveExtendFromGSIndex(EVT VT) const override; + private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, @@ -475,6 +477,7 @@ private: SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerMGATHER(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorMaskedLoadToRVV(SDValue Op, diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll new file mode 100644 index 000000000000..c5f9ea8aa3e3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -0,0 +1,2194 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64 + +declare @llvm.masked.gather.nxv1i8.nxv1p0i8(, i32, , ) + +define @mgather_nxv1i8( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv1i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf8,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv1i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf8,tu,mu +; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv1i8.nxv1p0i8( %ptrs, i32 1, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv2i8.nxv2p0i8(, i32, , ) + +define @mgather_nxv2i8( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v10 +; RV64-NEXT: ret + %v = call 
@llvm.masked.gather.nxv2i8.nxv2p0i8( %ptrs, i32 1, %m, %passthru) + ret %v +} + +define @mgather_nxv2i8_sextload_nxv2i16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i8_sextload_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vsext.vf2 v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i8_sextload_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vsext.vf2 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i8.nxv2p0i8( %ptrs, i32 1, %m, %passthru) + %ev = sext %v to + ret %ev +} + +define @mgather_nxv2i8_zextload_nxv2i16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i8_zextload_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vzext.vf2 v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i8_zextload_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vzext.vf2 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i8.nxv2p0i8( %ptrs, i32 1, %m, %passthru) + %ev = zext %v to + ret %ev +} + +define @mgather_nxv2i8_sextload_nxv2i32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i8_sextload_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vsext.vf4 v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i8_sextload_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vsext.vf4 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i8.nxv2p0i8( %ptrs, i32 1, %m, %passthru) + %ev = sext %v to + ret %ev +} + +define @mgather_nxv2i8_zextload_nxv2i32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i8_zextload_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vzext.vf4 v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i8_zextload_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vzext.vf4 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i8.nxv2p0i8( %ptrs, i32 1, %m, %passthru) + %ev = zext %v to + ret %ev +} + +define @mgather_nxv2i8_sextload_nxv2i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i8_sextload_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vsext.vf8 v26, v9 +; RV32-NEXT: vmv2r.v v8, v26 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i8_sextload_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vsext.vf8 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i8.nxv2p0i8( %ptrs, i32 1, %m, %passthru) + %ev = sext %v to + ret %ev +} + +define 
@mgather_nxv2i8_zextload_nxv2i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i8_zextload_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vzext.vf8 v26, v9 +; RV32-NEXT: vmv2r.v v8, v26 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i8_zextload_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf4,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vzext.vf8 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i8.nxv2p0i8( %ptrs, i32 1, %m, %passthru) + %ev = zext %v to + ret %ev +} + +declare @llvm.masked.gather.nxv4i8.nxv4p0i8(, i32, , ) + +define @mgather_nxv4i8( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf2,tu,mu +; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf2,tu,mu +; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4i8.nxv4p0i8( %ptrs, i32 1, %m, %passthru) + ret %v +} + +define @mgather_truemask_nxv4i8( %ptrs, %passthru) { +; RV32-LABEL: mgather_truemask_nxv4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf2,ta,mu +; RV32-NEXT: vloxei32.v v8, (zero), v8 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_truemask_nxv4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf2,ta,mu +; RV64-NEXT: vloxei64.v v8, (zero), v8 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + %v = call @llvm.masked.gather.nxv4i8.nxv4p0i8( %ptrs, i32 1, %mtrue, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv8i8.nxv8p0i8(, i32, , ) + +define @mgather_nxv8i8( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,m1,tu,mu +; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,m1,tu,mu +; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v16 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv8i8.nxv8p0i8( %ptrs, i32 1, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i8(i8* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsetvli a1, zero, e8,m1,tu,mu +; RV32-NEXT: vloxei32.v v9, (a0), v28, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu +; RV64-NEXT: vloxei64.v v9, (a0), v16, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i8, i8* %base, %idxs + %v = call @llvm.masked.gather.nxv8i8.nxv8p0i8( %ptrs, i32 1, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv1i16.nxv1p0i16(, i32, , ) + +define @mgather_nxv1i16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv1i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv1i16: +; RV64: # %bb.0: +; 
RV64-NEXT: vsetvli a0, zero, e16,mf4,tu,mu +; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv1i16.nxv1p0i16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv2i16.nxv2p0i16(, i32, , ) + +define @mgather_nxv2i16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i16.nxv2p0i16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_nxv2i16_sextload_nxv2i32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i16_sextload_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vsext.vf2 v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i16_sextload_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vsext.vf2 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i16.nxv2p0i16( %ptrs, i32 2, %m, %passthru) + %ev = sext %v to + ret %ev +} + +define @mgather_nxv2i16_zextload_nxv2i32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i16_zextload_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vzext.vf2 v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i16_zextload_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vzext.vf2 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i16.nxv2p0i16( %ptrs, i32 2, %m, %passthru) + %ev = zext %v to + ret %ev +} + +define @mgather_nxv2i16_sextload_nxv2i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i16_sextload_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vsext.vf4 v26, v9 +; RV32-NEXT: vmv2r.v v8, v26 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i16_sextload_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vsext.vf4 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i16.nxv2p0i16( %ptrs, i32 2, %m, %passthru) + %ev = sext %v to + ret %ev +} + +define @mgather_nxv2i16_zextload_nxv2i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i16_zextload_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vzext.vf4 v26, v9 +; RV32-NEXT: vmv2r.v v8, v26 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i16_zextload_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vzext.vf4 v8, v10 +; RV64-NEXT: ret + %v = call 
@llvm.masked.gather.nxv2i16.nxv2p0i16( %ptrs, i32 2, %m, %passthru) + %ev = zext %v to + ret %ev +} + +declare @llvm.masked.gather.nxv4i16.nxv4p0i16(, i32, , ) + +define @mgather_nxv4i16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,tu,mu +; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,tu,mu +; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4i16.nxv4p0i16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_truemask_nxv4i16( %ptrs, %passthru) { +; RV32-LABEL: mgather_truemask_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vloxei32.v v8, (zero), v8 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_truemask_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vloxei64.v v8, (zero), v8 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + %v = call @llvm.masked.gather.nxv4i16.nxv4p0i16( %ptrs, i32 2, %mtrue, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv8i16.nxv8p0i16(, i32, , ) + +define @mgather_nxv8i16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t +; RV32-NEXT: vmv2r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vmv2r.v v8, v16 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv8i16.nxv8p0i16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i8_nxv8i16(i16* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i16, i16* %base, %idxs + %v = call @llvm.masked.gather.nxv8i16.nxv8p0i16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i8_nxv8i16(i16* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i16, i16* %base, %eidxs + %v = call 
@llvm.masked.gather.nxv8i16.nxv8p0i16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i8_nxv8i16(i16* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i16, i16* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i16.nxv8p0i16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i16(i16* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i16, i16* %base, %idxs + %v = call @llvm.masked.gather.nxv8i16.nxv8p0i16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv1i32.nxv1p0i32(, i32, , ) + +define @mgather_nxv1i32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,mf2,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,mf2,tu,mu +; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv1i32.nxv1p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv2i32.nxv2p0i32(, i32, , ) + +define @mgather_nxv2i32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i32.nxv2p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_nxv2i32_sextload_nxv2i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i32_sextload_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vsext.vf2 v26, v9 +; RV32-NEXT: vmv2r.v v8, v26 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i32_sextload_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; 
RV64-NEXT: vsext.vf2 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i32.nxv2p0i32( %ptrs, i32 4, %m, %passthru) + %ev = sext %v to + ret %ev +} + +define @mgather_nxv2i32_zextload_nxv2i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i32_zextload_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vzext.vf2 v26, v9 +; RV32-NEXT: vmv2r.v v8, v26 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i32_zextload_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vzext.vf2 v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i32.nxv2p0i32( %ptrs, i32 4, %m, %passthru) + %ev = zext %v to + ret %ev +} + +declare @llvm.masked.gather.nxv4i32.nxv4p0i32(, i32, , ) + +define @mgather_nxv4i32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,tu,mu +; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t +; RV64-NEXT: vmv2r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_truemask_nxv4i32( %ptrs, %passthru) { +; RV32-LABEL: mgather_truemask_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vloxei32.v v8, (zero), v8 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_truemask_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vloxei64.v v8, (zero), v8 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + %v = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %ptrs, i32 4, %mtrue, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv8i32.nxv8p0i32(, i32, , ) + +define @mgather_nxv8i32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vmv4r.v v8, v16 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv8i32.nxv8p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i8_nxv8i32(i32* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + %v = call @llvm.masked.gather.nxv8i32.nxv8p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i8_nxv8i32(i32* %base, 
%idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i32.nxv8p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i8_nxv8i32(i32* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i32.nxv8p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i16_nxv8i32(i32* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + %v = call @llvm.masked.gather.nxv8i32.nxv8p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i16_nxv8i32(i32* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i32.nxv8p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define 
@mgather_baseidx_zext_nxv8i16_nxv8i32(i32* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i32.nxv8p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i32(i32* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v8, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + %v = call @llvm.masked.gather.nxv8i32.nxv8p0i32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv1i64.nxv1p0i64(, i32, , ) + +define @mgather_nxv1i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m1,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m1,tu,mu +; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv1i64.nxv1p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv2i64.nxv2p0i64(, i32, , ) + +define @mgather_nxv2i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2i64.nxv2p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv4i64.nxv4p0i64(, i32, , ) + +define @mgather_nxv4i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4i64.nxv4p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_truemask_nxv4i64( %ptrs, %passthru) { +; RV32-LABEL: mgather_truemask_nxv4i64: 
+; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vloxei32.v v8, (zero), v8 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_truemask_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vloxei64.v v8, (zero), v8 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + %v = call @llvm.masked.gather.nxv4i64.nxv4p0i64( %ptrs, i32 8, %mtrue, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv8i64.nxv8p0i64(, i32, , ) + +define @mgather_nxv8i64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (zero), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i8_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i8_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf8 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i8_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf8 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = 
zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i16_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i16_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf4 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i16_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf4 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i32_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i32_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v8, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, 
%idxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i32_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i32_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf2 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i32_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i32_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf2 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i64(i64* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + %v = call @llvm.masked.gather.nxv8i64.nxv8p0i64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv16i64.nxv16p0f64(, i32, , ) + +declare @llvm.experimental.vector.insert.nxv8i64.nxv16i64(, , i64 %idx) +declare @llvm.experimental.vector.insert.nxv8p0i64.nxv16p0i64(, , i64 %idx) + +define void @mgather_nxv16i64( %ptrs0, %ptrs1, %m, %passthru0, %passthru1, * %out) { +; RV32-LABEL: mgather_nxv16i64: +; RV32: # %bb.0: +; RV32-NEXT: vl8re64.v v24, (a0) +; RV32-NEXT: vsetvli a0, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (zero), v8, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: srli a0, a0, 3 +; RV32-NEXT: vsetvli a2, zero, e8,mf4,ta,mu +; RV32-NEXT: vslidedown.vx v0, v0, a0 +; RV32-NEXT: vsetvli a2, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v24, (zero), v12, v0.t +; RV32-NEXT: slli a0, a0, 6 +; RV32-NEXT: add a0, a1, a0 +; RV32-NEXT: vs8r.v v24, (a0) +; RV32-NEXT: vs8r.v v16, (a1) +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv16i64: +; 
RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a3, vlenb +; RV64-NEXT: slli a3, a3, 3 +; RV64-NEXT: sub sp, sp, a3 +; RV64-NEXT: vl8re64.v v24, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vmv8r.v v16, v8 +; RV64-NEXT: vl8re64.v v8, (a1) +; RV64-NEXT: vsetvli a0, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v24, (zero), v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: srli a0, a0, 3 +; RV64-NEXT: vsetvli a1, zero, e8,mf4,ta,mu +; RV64-NEXT: vslidedown.vx v0, v0, a0 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl8re8.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vloxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: slli a0, a0, 6 +; RV64-NEXT: add a0, a2, a0 +; RV64-NEXT: vs8r.v v8, (a0) +; RV64-NEXT: vs8r.v v24, (a2) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %p0 = call @llvm.experimental.vector.insert.nxv8p0i64.nxv16p0i64( undef, %ptrs0, i64 0) + %p1 = call @llvm.experimental.vector.insert.nxv8p0i64.nxv16p0i64( %p0, %ptrs1, i64 8) + + %pt0 = call @llvm.experimental.vector.insert.nxv8i64.nxv16i64( undef, %passthru0, i64 0) + %pt1 = call @llvm.experimental.vector.insert.nxv8i64.nxv16i64( %pt0, %passthru1, i64 8) + + %v = call @llvm.masked.gather.nxv16i64.nxv16p0f64( %p1, i32 8, %m, %pt1) + store %v, * %out + ret void +} + + +declare @llvm.masked.gather.nxv1f16.nxv1p0f16(, i32, , ) + +define @mgather_nxv1f16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv1f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf4,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv1f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf4,tu,mu +; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv1f16.nxv1p0f16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv2f16.nxv2p0f16(, i32, , ) + +define @mgather_nxv2f16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2f16.nxv2p0f16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv4f16.nxv4p0f16(, i32, , ) + +define @mgather_nxv4f16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,tu,mu +; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,tu,mu +; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4f16.nxv4p0f16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_truemask_nxv4f16( %ptrs, %passthru) { +; RV32-LABEL: mgather_truemask_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vloxei32.v v8, (zero), v8 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_truemask_nxv4f16: +; RV64: # %bb.0: +; 
RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vloxei64.v v8, (zero), v8 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + %v = call @llvm.masked.gather.nxv4f16.nxv4p0f16( %ptrs, i32 2, %mtrue, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv8f16.nxv8p0f16(, i32, , ) + +define @mgather_nxv8f16( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t +; RV32-NEXT: vmv2r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vmv2r.v v8, v16 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv8f16.nxv8p0f16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i8_nxv8f16(half* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds half, half* %base, %idxs + %v = call @llvm.masked.gather.nxv8f16.nxv8p0f16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i8_nxv8f16(half* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds half, half* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f16.nxv8p0f16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i8_nxv8f16(half* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds half, half* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f16.nxv8p0f16( %ptrs, i32 2, %m, %passthru) + ret %v +} + 
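+; NOTE (aside, not from the original patch): the angle-bracketed scalable
+; vector types in this file were stripped in extraction; each call can be
+; read back from the intrinsic name mangling, e.g. nxv4f16 means
+; <vscale x 4 x half> and nxv4p0f16 means <vscale x 4 x half*>. As a
+; hedged sketch, the body of the truemask test above written out with
+; full types:
+;
+;   %mhead = insertelement <vscale x 4 x i1> undef, i1 1, i32 0
+;   %mtrue = shufflevector <vscale x 4 x i1> %mhead, <vscale x 4 x i1> undef,
+;                          <vscale x 4 x i32> zeroinitializer
+;   %v = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16.nxv4p0f16(
+;            <vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mtrue,
+;            <vscale x 4 x half> %passthru)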
+define @mgather_baseidx_nxv8f16(half* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v28, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds half, half* %base, %idxs + %v = call @llvm.masked.gather.nxv8f16.nxv8p0f16( %ptrs, i32 2, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv1f32.nxv1p0f32(, i32, , ) + +define @mgather_nxv1f32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv1f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,mf2,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv1f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,mf2,tu,mu +; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv1f32.nxv1p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv2f32.nxv2p0f32(, i32, , ) + +define @mgather_nxv2f32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv2f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2f32.nxv2p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv4f32.nxv4p0f32(, i32, , ) + +define @mgather_nxv4f32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,tu,mu +; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t +; RV64-NEXT: vmv2r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_truemask_nxv4f32( %ptrs, %passthru) { +; RV32-LABEL: mgather_truemask_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vloxei32.v v8, (zero), v8 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_truemask_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vloxei64.v v8, (zero), v8 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + %v = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %ptrs, i32 4, %mtrue, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv8f32.nxv8p0f32(, i32, , ) + +define @mgather_nxv8f32( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli 
a0, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vmv4r.v v8, v16 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv8f32.nxv8p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i8_nxv8f32(float* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds float, float* %base, %idxs + %v = call @llvm.masked.gather.nxv8f32.nxv8p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i8_nxv8f32(float* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f32.nxv8p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i8_nxv8f32(float* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f32.nxv8p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i16_nxv8f32(float* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i16_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t 
+; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds float, float* %base, %idxs + %v = call @llvm.masked.gather.nxv8f32.nxv8p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i16_nxv8f32(float* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f32.nxv8p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i16_nxv8f32(float* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f32.nxv8p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8f32(float* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v8, 2 +; RV32-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v28, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v16, v8 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds float, float* %base, %idxs + %v = call @llvm.masked.gather.nxv8f32.nxv8p0f32( %ptrs, i32 4, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv1f64.nxv1p0f64(, i32, , ) + +define @mgather_nxv1f64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m1,tu,mu +; RV32-NEXT: vloxei32.v v9, (zero), v8, v0.t +; RV32-NEXT: vmv1r.v v8, v9 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m1,tu,mu +; RV64-NEXT: vloxei64.v v9, (zero), v8, v0.t +; RV64-NEXT: vmv1r.v v8, v9 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv1f64.nxv1p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv2f64.nxv2p0f64(, i32, , ) + +define @mgather_nxv2f64( %ptrs, %m, %passthru) { +; RV32-LABEL: 
mgather_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (zero), v8, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv2f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m2,tu,mu +; RV64-NEXT: vloxei64.v v10, (zero), v8, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv2f64.nxv2p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv4f64.nxv4p0f64(, i32, , ) + +define @mgather_nxv4f64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,tu,mu +; RV32-NEXT: vloxei32.v v12, (zero), v8, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,tu,mu +; RV64-NEXT: vloxei64.v v12, (zero), v8, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4f64.nxv4p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_truemask_nxv4f64( %ptrs, %passthru) { +; RV32-LABEL: mgather_truemask_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vloxei32.v v8, (zero), v8 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_truemask_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vloxei64.v v8, (zero), v8 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + %v = call @llvm.masked.gather.nxv4f64.nxv4p0f64( %ptrs, i32 8, %mtrue, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv8f64.nxv8p0f64(, i32, , ) + +define @mgather_nxv8f64( %ptrs, %m, %passthru) { +; RV32-LABEL: mgather_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (zero), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (zero), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i8_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i8_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i8_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i8_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf8 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i8_nxv8f64: +; RV64: # 
%bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i8_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf8 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i8_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i16_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i16_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v8 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i16_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i16_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf4 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i16_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i16_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i16_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf4 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: 
mgather_baseidx_zext_nxv8i16_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8i32_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8i32_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v8, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8i32_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_sext_nxv8i32_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_sext_nxv8i32_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf2 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_sext_nxv8i32_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_zext_nxv8i32_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_zext_nxv8i32_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf2 v24, v8 +; RV32-NEXT: vsll.vi v8, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_zext_nxv8i32_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf2 v24, v8 +; RV64-NEXT: vsll.vi v8, v24, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +define @mgather_baseidx_nxv8f64(double* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsll.vi v8, v8, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV32-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV32-NEXT: vmv8r.v v8, v16 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv8f64: +; RV64: # %bb.0: +; 
RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsetvli a1, zero, e64,m8,tu,mu +; RV64-NEXT: vloxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: vmv8r.v v8, v16 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + %v = call @llvm.masked.gather.nxv8f64.nxv8p0f64( %ptrs, i32 8, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv16i8.nxv16p0i8(, i32, , ) + +define @mgather_baseidx_nxv16i8(i8* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv16i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu +; RV32-NEXT: vsext.vf4 v16, v8 +; RV32-NEXT: vsetvli a1, zero, e8,m2,tu,mu +; RV32-NEXT: vloxei32.v v10, (a0), v16, v0.t +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv16i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu +; RV64-NEXT: vloxei64.v v10, (a0), v16, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu +; RV64-NEXT: vslidedown.vx v0, v0, a1 +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v9 +; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu +; RV64-NEXT: vloxei64.v v11, (a0), v16, v0.t +; RV64-NEXT: vmv2r.v v8, v10 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i8, i8* %base, %idxs + %v = call @llvm.masked.gather.nxv16i8.nxv16p0i8( %ptrs, i32 2, %m, %passthru) + ret %v +} + +declare @llvm.masked.gather.nxv32i8.nxv32p0i8(, i32, , ) + +define @mgather_baseidx_nxv32i8(i8* %base, %idxs, %m, %passthru) { +; RV32-LABEL: mgather_baseidx_nxv32i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu +; RV32-NEXT: vsext.vf4 v16, v8 +; RV32-NEXT: vsetvli a1, zero, e8,m2,tu,mu +; RV32-NEXT: vloxei32.v v12, (a0), v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: vsetvli a2, zero, e8,mf2,ta,mu +; RV32-NEXT: vslidedown.vx v0, v0, a1 +; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu +; RV32-NEXT: vsext.vf4 v16, v10 +; RV32-NEXT: vsetvli a1, zero, e8,m2,tu,mu +; RV32-NEXT: vloxei32.v v14, (a0), v16, v0.t +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_baseidx_nxv32i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v8 +; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu +; RV64-NEXT: vloxei64.v v12, (a0), v16, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu +; RV64-NEXT: vslidedown.vx v25, v0, a1 +; RV64-NEXT: vmv1r.v v26, v0 +; RV64-NEXT: vsetvli a2, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v9 +; RV64-NEXT: vsetvli a2, zero, e8,m1,tu,mu +; RV64-NEXT: vmv1r.v v0, v25 +; RV64-NEXT: vloxei64.v v13, (a0), v16, v0.t +; RV64-NEXT: slli a2, a1, 1 +; RV64-NEXT: vsetvli a3, zero, e8,mf2,ta,mu +; RV64-NEXT: vslidedown.vx v26, v26, a2 +; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu +; RV64-NEXT: vslidedown.vx v0, v26, a1 +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v11 +; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu +; RV64-NEXT: vloxei64.v v15, (a0), v16, v0.t +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsetvli a1, zero, e8,m1,tu,mu +; RV64-NEXT: vmv1r.v v0, v26 +; RV64-NEXT: vloxei64.v v14, (a0), v16, v0.t +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i8, i8* %base, %idxs + %v = call @llvm.masked.gather.nxv32i8.nxv32p0i8( %ptrs, i32 2, %m, 
%passthru) + ret %v +} -- GitLab From 251fe986afd35bc257a8b043a49bddc98473d565 Mon Sep 17 00:00:00 2001 From: Nigel Perks Date: Mon, 14 Sep 2020 18:17:11 +0100 Subject: [PATCH 0015/1000] [Test][DebugInfo] Check for backend object emission support. The XCore backend does not support object emission. Several tests fail for this reason when XCore is the default target. See staging buildbot builder: clang-xcore-ubuntu-20-x64. So check for backend object emission before running the tests requiring it. Incorporate isConfigurationSupported functionality in isObjectEmissionSupported, to avoid calling them both in the same tests. Differential Revision: https://reviews.llvm.org/D98400 --- .../DebugInfo/DWARF/DWARFDebugInfoTest.cpp | 24 +++++++++---------- llvm/unittests/DebugInfo/DWARF/DwarfUtils.cpp | 7 ++++++ llvm/unittests/DebugInfo/DWARF/DwarfUtils.h | 1 + 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp index 6b644b0a4eba..4cafc9a9258f 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp @@ -42,7 +42,7 @@ namespace { template void TestAllForms() { Triple Triple = getDefaultTargetTripleForAddrSize(sizeof(AddrType)); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; // Test that we can decode all DW_FORM values correctly. @@ -456,7 +456,7 @@ TEST(DWARFDebugInfo, TestDWARF32Version5Addr8AllForms) { template void TestChildren() { Triple Triple = getDefaultTargetTripleForAddrSize(sizeof(AddrType)); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; // Test that we can decode DW_FORM_ref_addr values correctly in DWARF 2 with @@ -586,7 +586,7 @@ TEST(DWARFDebugInfo, TestDWARF32Version4Addr8Children) { template void TestReferences() { Triple Triple = getDefaultTargetTripleForAddrSize(sizeof(AddrType)); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; // Test that we can decode DW_FORM_refXXX values correctly in DWARF. 
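A minimal sketch, assuming only the helpers this patch adds, of the guard
pattern the hunks below apply; the test name and body here are hypothetical:

  TEST(DWARFDebugInfo, SomeObjectEmittingTest) {
    Triple T = getNormalizedDefaultTargetTriple();
    if (!isObjectEmissionSupported(T))
      return; // e.g. XCore: target is registered but has no MC asm backend
    // ... generate DWARF, emit an object file, and verify it here ...
  }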
@@ -836,7 +836,7 @@ TEST(DWARFDebugInfo, TestDWARF32Version4Addr8References) { template void TestAddresses() { Triple Triple = getDefaultTargetTripleForAddrSize(sizeof(AddrType)); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; // Test the DWARF APIs related to accessing the DW_AT_low_pc and @@ -1008,7 +1008,7 @@ TEST(DWARFDebugInfo, TestDWARF32Version4Addr8Addresses) { TEST(DWARFDebugInfo, TestStringOffsets) { Triple Triple = getNormalizedDefaultTargetTriple(); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; const char *String1 = "Hello"; @@ -1072,7 +1072,7 @@ TEST(DWARFDebugInfo, TestStringOffsets) { TEST(DWARFDebugInfo, TestEmptyStringOffsets) { Triple Triple = getNormalizedDefaultTargetTriple(); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; const char *String1 = "Hello"; @@ -1101,7 +1101,7 @@ TEST(DWARFDebugInfo, TestEmptyStringOffsets) { TEST(DWARFDebugInfo, TestRelations) { Triple Triple = getNormalizedDefaultTargetTriple(); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; // Test the DWARF APIs related to accessing the DW_AT_low_pc and @@ -1288,7 +1288,7 @@ TEST(DWARFDebugInfo, TestDWARFDie) { TEST(DWARFDebugInfo, TestChildIterators) { Triple Triple = getNormalizedDefaultTargetTriple(); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; // Test the DWARF APIs related to iterating across the children of a DIE using @@ -1397,7 +1397,7 @@ TEST(DWARFDebugInfo, TestEmptyChildren) { TEST(DWARFDebugInfo, TestAttributeIterators) { Triple Triple = getNormalizedDefaultTargetTriple(); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; // Test the DWARF APIs related to iterating across all attribute values in a @@ -1459,7 +1459,7 @@ TEST(DWARFDebugInfo, TestAttributeIterators) { TEST(DWARFDebugInfo, TestFindRecurse) { Triple Triple = getNormalizedDefaultTargetTriple(); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; uint16_t Version = 4; @@ -1673,7 +1673,7 @@ TEST(DWARFDebugInfo, TestDwarfToFunctions) { TEST(DWARFDebugInfo, TestFindAttrs) { Triple Triple = getNormalizedDefaultTargetTriple(); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; // Test the DWARFDie::find() and DWARFDie::findRecursively() that take an @@ -1736,7 +1736,7 @@ TEST(DWARFDebugInfo, TestFindAttrs) { TEST(DWARFDebugInfo, TestImplicitConstAbbrevs) { Triple Triple = getNormalizedDefaultTargetTriple(); - if (!isConfigurationSupported(Triple)) + if (!isObjectEmissionSupported(Triple)) return; uint16_t Version = 5; diff --git a/llvm/unittests/DebugInfo/DWARF/DwarfUtils.cpp b/llvm/unittests/DebugInfo/DWARF/DwarfUtils.cpp index 249cfb42271a..20dc7bc8ff12 100644 --- a/llvm/unittests/DebugInfo/DWARF/DwarfUtils.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DwarfUtils.cpp @@ -52,3 +52,10 @@ bool llvm::dwarf::utils::isConfigurationSupported(Triple &T) { std::string Err; return TargetRegistry::lookupTarget(T.getTriple(), Err); } + +bool llvm::dwarf::utils::isObjectEmissionSupported(Triple &T) { + initLLVMIfNeeded(); + std::string Err; + const Target *TheTarget = TargetRegistry::lookupTarget(T.getTriple(), Err); + return TheTarget && TheTarget->hasMCAsmBackend(); +} diff --git a/llvm/unittests/DebugInfo/DWARF/DwarfUtils.h b/llvm/unittests/DebugInfo/DWARF/DwarfUtils.h index 036071e0b567..00eaef25cfba 100644 --- 
a/llvm/unittests/DebugInfo/DWARF/DwarfUtils.h +++ b/llvm/unittests/DebugInfo/DWARF/DwarfUtils.h @@ -21,6 +21,7 @@ namespace utils { Triple getDefaultTargetTripleForAddrSize(uint8_t AddrSize); Triple getNormalizedDefaultTargetTriple(); bool isConfigurationSupported(Triple &T); +bool isObjectEmissionSupported(Triple &T); } // end namespace utils } // end namespace dwarf -- GitLab From 3495031a39b76b1f85367b68199a79f19dbd9d9e Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Mon, 8 Feb 2021 15:33:23 +0000 Subject: [PATCH 0016/1000] [RISCV] Support scalable-vector masked scatter operations This patch adds support for masked scatter intrinsics on scalable vector types. It is mostly an extension of the earlier masked gather support introduced in D96263, since the addressing mode legalization is the same. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D96486 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 96 +- llvm/lib/Target/RISCV/RISCVISelLowering.h | 2 +- .../test/CodeGen/RISCV/rvv/mscatter-sdnode.ll | 1854 +++++++++++++++++ 3 files changed, 1915 insertions(+), 37 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ee686102c147..bea946daa473 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -475,6 +475,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); @@ -517,6 +518,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FCOPYSIGN, VT, Legal); setOperationAction(ISD::MGATHER, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); @@ -695,6 +697,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.hasStdExtV()) { setTargetDAGCombine(ISD::FCOPYSIGN); setTargetDAGCombine(ISD::MGATHER); + setTargetDAGCombine(ISD::MSCATTER); } } @@ -1719,7 +1722,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::FCOPYSIGN: return lowerFixedLengthVectorFCOPYSIGNToRVV(Op, DAG); case ISD::MGATHER: - return lowerMGATHER(Op, DAG); + case ISD::MSCATTER: + return lowerMGATHERMSCATTER(Op, DAG); } } @@ -3467,39 +3471,50 @@ SDValue RISCVTargetLowering::lowerToScalableOp(SDValue Op, SelectionDAG &DAG, // "unsigned unscaled" addressing mode; indices are implicitly zero-extended or // truncated to XLEN and are treated as byte offsets. Any signed or scaled // indexing is extended to the XLEN value type and scaled accordingly. -SDValue RISCVTargetLowering::lowerMGATHER(SDValue Op, SelectionDAG &DAG) const { - MaskedGatherSDNode *N = cast(Op.getNode()); +SDValue RISCVTargetLowering::lowerMGATHERMSCATTER(SDValue Op, + SelectionDAG &DAG) const { + auto *N = cast(Op.getNode()); SDLoc DL(Op); - MVT VT = Op.getSimpleValueType(); SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); - SDValue PassThru = N->getPassThru(); MVT XLenVT = Subtarget.getXLenVT(); assert(N->getBasePtr().getSimpleValueType() == XLenVT && "Unexpected pointer type"); - // Targets have to explicitly opt-in for extending vector loads. 
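// A hedged sketch (not part of this hunk) of the gather/scatter dispatch
// the rewritten code below implements; every name used here appears later
// in this patch, only the arrangement is illustrative:
//
//   auto *N = cast<MaskedGatherScatterSDNode>(Op.getNode()); // common base
//   const auto *MGN = dyn_cast<MaskedGatherSDNode>(N);  // null for scatter
//   const auto *MSN = dyn_cast<MaskedScatterSDNode>(N); // null for gather
//   unsigned IntID = MGN ? Intrinsic::riscv_vloxei      // indexed load
//                        : Intrinsic::riscv_vsoxei;     // indexed store
//
// A scatter contributes the value being stored where a gather contributes
// its pass-through operand, and only the gather returns a result alongside
// the chain.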
- assert(N->getExtensionType() == ISD::NON_EXTLOAD && + // Targets have to explicitly opt-in for extending vector loads and + // truncating vector stores. + const auto *MGN = dyn_cast(N); + const auto *MSN = dyn_cast(N); + assert((!MGN || MGN->getExtensionType() == ISD::NON_EXTLOAD) && "Unexpected extending MGATHER"); + assert((!MSN || !MSN->isTruncatingStore()) && + "Unexpected extending MSCATTER"); - SDValue VL = getDefaultVLOps(VT, VT, DL, DAG, Subtarget).second; // If the mask is known to be all ones, optimize to an unmasked intrinsic; // the selection of the masked intrinsics doesn't do this for us. - if (ISD::isConstantSplatVectorAllOnes(Mask.getNode())) { - SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vloxei, DL, XLenVT); - SDValue Ops[] = {N->getChain(), IntID, N->getBasePtr(), Index, VL}; - return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, - DAG.getVTList(VT, MVT::Other), Ops, - N->getMemoryVT(), N->getMemOperand()); - } + unsigned IntID = 0; + MVT IndexVT = Index.getSimpleValueType(); + SDValue VL = getDefaultVLOps(IndexVT, IndexVT, DL, DAG, Subtarget).second; + bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode()); - SDValue IntID = - DAG.getTargetConstant(Intrinsic::riscv_vloxei_mask, DL, XLenVT); - SDValue Ops[] = {N->getChain(), IntID, PassThru, N->getBasePtr(), - Index, Mask, VL}; - return DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, - DAG.getVTList(VT, MVT::Other), Ops, - N->getMemoryVT(), N->getMemOperand()); + if (IsUnmasked) + IntID = MGN ? Intrinsic::riscv_vloxei : Intrinsic::riscv_vsoxei; + else + IntID = MGN ? Intrinsic::riscv_vloxei_mask : Intrinsic::riscv_vsoxei_mask; + SmallVector Ops{N->getChain(), + DAG.getTargetConstant(IntID, DL, XLenVT)}; + if (MSN) + Ops.push_back(MSN->getValue()); + else if (!IsUnmasked) + Ops.push_back(MGN->getPassThru()); + Ops.push_back(N->getBasePtr()); + Ops.push_back(Index); + if (!IsUnmasked) + Ops.push_back(Mask); + Ops.push_back(VL); + return DAG.getMemIntrinsicNode( + MGN ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, N->getVTList(), + Ops, N->getMemoryVT(), N->getMemOperand()); } // Returns the opcode of the target-specific SDNode that implements the 32-bit @@ -4519,18 +4534,19 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, return DAG.getNode(ISD::FCOPYSIGN, DL, VT, N->getOperand(0), DAG.getNode(ISD::FNEG, DL, VT, NewFPExtRound)); } - case ISD::MGATHER: { + case ISD::MGATHER: + case ISD::MSCATTER: { if (!DCI.isBeforeLegalize()) break; - MaskedGatherSDNode *MGN = cast(N); - SDValue Index = MGN->getIndex(); + MaskedGatherScatterSDNode *MGSN = cast(N); + SDValue Index = MGSN->getIndex(); EVT IndexVT = Index.getValueType(); MVT XLenVT = Subtarget.getXLenVT(); // RISCV indexed loads only support the "unsigned unscaled" addressing // mode, so anything else must be manually legalized. - bool NeedsIdxLegalization = - MGN->isIndexScaled() || - (MGN->isIndexSigned() && IndexVT.getVectorElementType().bitsLT(XLenVT)); + bool NeedsIdxLegalization = MGSN->isIndexScaled() || + (MGSN->isIndexSigned() && + IndexVT.getVectorElementType().bitsLT(XLenVT)); if (!NeedsIdxLegalization) break; @@ -4541,13 +4557,13 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, // LLVM's legalization take care of the splitting. if (IndexVT.getVectorElementType().bitsLT(XLenVT)) { IndexVT = IndexVT.changeVectorElementType(XLenVT); - Index = DAG.getNode(MGN->isIndexSigned() ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND, + Index = DAG.getNode(MGSN->isIndexSigned() ? 
ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND, DL, IndexVT, Index); } unsigned Scale = N->getConstantOperandVal(5); - if (MGN->isIndexScaled() && Scale != 1) { + if (MGSN->isIndexScaled() && Scale != 1) { // Manually scale the indices by the element size. // TODO: Sanitize the scale operand here? assert(isPowerOf2_32(Scale) && "Expecting power-of-two types"); @@ -4556,11 +4572,19 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N, } ISD::MemIndexType NewIndexTy = ISD::UNSIGNED_UNSCALED; - return DAG.getMaskedGather( - N->getVTList(), MGN->getMemoryVT(), DL, - {MGN->getChain(), MGN->getPassThru(), MGN->getMask(), MGN->getBasePtr(), - Index, MGN->getScale()}, - MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType()); + if (const auto *MGN = dyn_cast(N)) { + return DAG.getMaskedGather( + N->getVTList(), MGSN->getMemoryVT(), DL, + {MGSN->getChain(), MGN->getPassThru(), MGSN->getMask(), + MGSN->getBasePtr(), Index, MGN->getScale()}, + MGN->getMemOperand(), NewIndexTy, MGN->getExtensionType()); + } + const auto *MSN = cast(N); + return DAG.getMaskedScatter( + N->getVTList(), MGSN->getMemoryVT(), DL, + {MGSN->getChain(), MSN->getValue(), MGSN->getMask(), MGSN->getBasePtr(), + Index, MGSN->getScale()}, + MGSN->getMemOperand(), NewIndexTy, MSN->isTruncatingStore()); } } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 1aea84dd258a..35fdf2921e22 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -477,7 +477,7 @@ private: SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerMGATHER(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerMGATHERMSCATTER(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorMaskedLoadToRVV(SDValue Op, diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll new file mode 100644 index 000000000000..424ea2f90458 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -0,0 +1,1854 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV32 +; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+experimental-zfh,+experimental-v -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefix=RV64 + +declare void @llvm.masked.scatter.nxv1i8.nxv1p0i8(, , i32, ) + +define void @mscatter_nxv1i8( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf8,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1i8.nxv1p0i8( %val, %ptrs, i32 1, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2i8.nxv2p0i8(, , i32, ) + +define void @mscatter_nxv2i8( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i8: +; RV64: # %bb.0: +; 
RV64-NEXT: vsetvli a0, zero, e8,mf4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2i8.nxv2p0i8( %val, %ptrs, i32 1, %m) + ret void +} + +define void @mscatter_nxv2i16_truncstore_nxv2i8( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i16_truncstore_nxv2i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf4,ta,mu +; RV32-NEXT: vnsrl.wi v25, v8, 0 +; RV32-NEXT: vsoxei32.v v25, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i16_truncstore_nxv2i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf4,ta,mu +; RV64-NEXT: vnsrl.wi v25, v8, 0 +; RV64-NEXT: vsoxei64.v v25, (zero), v10, v0.t +; RV64-NEXT: ret + %tval = trunc %val to + call void @llvm.masked.scatter.nxv2i8.nxv2p0i8( %tval, %ptrs, i32 1, %m) + ret void +} + +define void @mscatter_nxv2i32_truncstore_nxv2i8( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i32_truncstore_nxv2i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vnsrl.wi v25, v8, 0 +; RV32-NEXT: vsetvli a0, zero, e8,mf4,ta,mu +; RV32-NEXT: vnsrl.wi v26, v25, 0 +; RV32-NEXT: vsoxei32.v v26, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i32_truncstore_nxv2i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vnsrl.wi v25, v8, 0 +; RV64-NEXT: vsetvli a0, zero, e8,mf4,ta,mu +; RV64-NEXT: vnsrl.wi v26, v25, 0 +; RV64-NEXT: vsoxei64.v v26, (zero), v10, v0.t +; RV64-NEXT: ret + %tval = trunc %val to + call void @llvm.masked.scatter.nxv2i8.nxv2p0i8( %tval, %ptrs, i32 1, %m) + ret void +} + +define void @mscatter_nxv2i64_truncstore_nxv2i8( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i64_truncstore_nxv2i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vnsrl.wi v25, v8, 0 +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vnsrl.wi v26, v25, 0 +; RV32-NEXT: vsetvli a0, zero, e8,mf4,ta,mu +; RV32-NEXT: vnsrl.wi v25, v26, 0 +; RV32-NEXT: vsoxei32.v v25, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i64_truncstore_nxv2i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vnsrl.wi v25, v8, 0 +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vnsrl.wi v26, v25, 0 +; RV64-NEXT: vsetvli a0, zero, e8,mf4,ta,mu +; RV64-NEXT: vnsrl.wi v25, v26, 0 +; RV64-NEXT: vsoxei64.v v25, (zero), v10, v0.t +; RV64-NEXT: ret + %tval = trunc %val to + call void @llvm.masked.scatter.nxv2i8.nxv2p0i8( %tval, %ptrs, i32 1, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4i8.nxv4p0i8(, , i32, ) + +define void @mscatter_nxv4i8( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i8.nxv4p0i8( %val, %ptrs, i32 1, %m) + ret void +} + +define void @mscatter_truemask_nxv4i8( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void 
@llvm.masked.scatter.nxv4i8.nxv4p0i8( %val, %ptrs, i32 1, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8i8.nxv8p0i8(, , i32, ) + +define void @mscatter_nxv8i8( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e8,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e8,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8i8.nxv8p0i8( %val, %ptrs, i32 1, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8( %val, i8* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v9 +; RV32-NEXT: vsetvli a1, zero, e8,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v9 +; RV64-NEXT: vsetvli a1, zero, e8,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i8, i8* %base, %idxs + call void @llvm.masked.scatter.nxv8i8.nxv8p0i8( %val, %ptrs, i32 1, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1i16.nxv1p0i16(, , i32, ) + +define void @mscatter_nxv1i16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1i16.nxv1p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2i16.nxv2p0i16(, , i32, ) + +define void @mscatter_nxv2i16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2i16.nxv2p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_nxv2i32_truncstore_nxv2i16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i32_truncstore_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vnsrl.wi v25, v8, 0 +; RV32-NEXT: vsoxei32.v v25, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i32_truncstore_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vnsrl.wi v25, v8, 0 +; RV64-NEXT: vsoxei64.v v25, (zero), v10, v0.t +; RV64-NEXT: ret + %tval = trunc %val to + call void @llvm.masked.scatter.nxv2i16.nxv2p0i16( %tval, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_nxv2i64_truncstore_nxv2i16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i64_truncstore_nxv2i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vnsrl.wi v25, v8, 0 +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vnsrl.wi v26, v25, 0 +; RV32-NEXT: vsoxei32.v v26, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i64_truncstore_nxv2i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vnsrl.wi v25, v8, 0 +; RV64-NEXT: vsetvli a0, zero, 
e16,mf2,ta,mu +; RV64-NEXT: vnsrl.wi v26, v25, 0 +; RV64-NEXT: vsoxei64.v v26, (zero), v10, v0.t +; RV64-NEXT: ret + %tval = trunc %val to + call void @llvm.masked.scatter.nxv2i16.nxv2p0i16( %tval, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4i16.nxv4p0i16(, , i32, ) + +define void @mscatter_nxv4i16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_truemask_nxv4i16( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %val, %ptrs, i32 2, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8i16.nxv8p0i16(, , i32, ) + +define void @mscatter_nxv8i16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8i16( %val, i16* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i16, i16* %base, %idxs + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8i16( %val, i16* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i16, i16* %base, %eidxs + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void 
@mscatter_baseidx_zext_nxv8i8_nxv8i16( %val, i16* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i16, i16* %base, %eidxs + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i16( %val, i16* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i16, i16* %base, %idxs + call void @llvm.masked.scatter.nxv8i16.nxv8p0i16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1i32.nxv1p0i32(, , i32, ) + +define void @mscatter_nxv1i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1i32.nxv1p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2i32.nxv2p0i32(, , i32, ) + +define void @mscatter_nxv2i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2i32.nxv2p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_nxv2i64_truncstore_nxv2i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i64_truncstore_nxv2i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vnsrl.wi v25, v8, 0 +; RV32-NEXT: vsoxei32.v v25, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i64_truncstore_nxv2i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vnsrl.wi v25, v8, 0 +; RV64-NEXT: vsoxei64.v v25, (zero), v10, v0.t +; RV64-NEXT: ret + %tval = trunc %val to + call void @llvm.masked.scatter.nxv2i32.nxv2p0i32( %tval, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4i32.nxv4p0i32(, , i32, ) + +define void @mscatter_nxv4i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, 
v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_truemask_nxv4i32( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %val, %ptrs, i32 4, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8i32.nxv8p0i32(, , i32, ) + +define void @mscatter_nxv8i32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t 
+; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i16_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i16_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i16_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i32, i32* %base, %eidxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i32( %val, i32* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i32, i32* %base, %idxs + call void @llvm.masked.scatter.nxv8i32.nxv8p0i32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1i64.nxv1p0i64(, , i32, ) + +define void @mscatter_nxv1i64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: 
mscatter_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1i64.nxv1p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2i64.nxv2p0i64(, , i32, ) + +define void @mscatter_nxv2i64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2i64.nxv2p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4i64.nxv4p0i64(, , i32, ) + +define void @mscatter_nxv4i64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_truemask_nxv4i64( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( %val, %ptrs, i32 8, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8i64.nxv8p0i64(, , i32, ) + +define void @mscatter_nxv8i64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v16 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf8 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: 
mscatter_baseidx_sext_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf8 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i16_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v16 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i16_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf4 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i16_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf4 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i32_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i32_nxv8i64: 
+; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v16, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i32_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf2 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i32_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf2 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds i64, i64* %base, %eidxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i64( %val, i64* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds i64, i64* %base, %idxs + call void @llvm.masked.scatter.nxv8i64.nxv8p0i64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1f16.nxv1p0f16(, , i32, ) + +define void @mscatter_nxv1f16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1f16.nxv1p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2f16.nxv2p0f16(, , i32, ) + +define void @mscatter_nxv2f16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; 
RV64-LABEL: mscatter_nxv2f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2f16.nxv2p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4f16.nxv4p0f16(, , i32, ) + +define void @mscatter_nxv4f16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4f16.nxv4p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_truemask_nxv4f16( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4f16.nxv4p0f16( %val, %ptrs, i32 2, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8f16.nxv8p0f16(, , i32, ) + +define void @mscatter_nxv8f16( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8f16( %val, half* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds half, half* %base, %idxs + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8f16( %val, half* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds half, half* %base, %eidxs + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + 
+define void @mscatter_baseidx_zext_nxv8i8_nxv8f16( %val, half* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds half, half* %base, %eidxs + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +define void @mscatter_baseidx_nxv8f16( %val, half* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8f16: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v10 +; RV32-NEXT: vsll.vi v28, v28, 1 +; RV32-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8f16: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v10 +; RV64-NEXT: vsll.vi v16, v16, 1 +; RV64-NEXT: vsetvli a1, zero, e16,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds half, half* %base, %idxs + call void @llvm.masked.scatter.nxv8f16.nxv8p0f16( %val, %ptrs, i32 2, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1f32.nxv1p0f32(, , i32, ) + +define void @mscatter_nxv1f32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,mf2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1f32.nxv1p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2f32.nxv2p0f32(, , i32, ) + +define void @mscatter_nxv2f32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2f32.nxv2p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4f32.nxv4p0f32(, , i32, ) + +define void @mscatter_nxv4f32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_truemask_nxv4f32( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, 
e32,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( %val, %ptrs, i32 4, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8f32.nxv8p0f32(, , i32, ) + +define void @mscatter_nxv8f32( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e32,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds float, float* %base, %idxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf4 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i16_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8f32: +; RV64: 
# %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds float, float* %base, %idxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i16_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i16_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vzext.vf2 v28, v12 +; RV32-NEXT: vsll.vi v28, v28, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds float, float* %base, %eidxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +define void @mscatter_baseidx_nxv8f32( %val, float* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8f32: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v12, 2 +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8f32: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v16, v12 +; RV64-NEXT: vsll.vi v16, v16, 2 +; RV64-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds float, float* %base, %idxs + call void @llvm.masked.scatter.nxv8f32.nxv8p0f32( %val, %ptrs, i32 4, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv1f64.nxv1p0f64(, , i32, ) + +define void @mscatter_nxv1f64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv1f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v9, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv1f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m1,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v9, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv1f64.nxv1p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv2f64.nxv2p0f64(, , i32, ) + +define void @mscatter_nxv2f64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv2f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v10, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv2f64: +; 
RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m2,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v10, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv2f64.nxv2p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv4f64.nxv4p0f64(, , i32, ) + +define void @mscatter_nxv4f64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_truemask_nxv4f64( %val, %ptrs) { +; RV32-LABEL: mscatter_truemask_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_truemask_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m4,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v12 +; RV64-NEXT: ret + %mhead = insertelement undef, i1 1, i32 0 + %mtrue = shufflevector %mhead, undef, zeroinitializer + call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( %val, %ptrs, i32 8, %mtrue) + ret void +} + +declare void @llvm.masked.scatter.nxv8f64.nxv8p0f64(, , i32, ) + +define void @mscatter_nxv8f64( %val, %ptrs, %m) { +; RV32-LABEL: mscatter_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i8_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i8_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf4 v28, v16 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i8_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i8_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf8 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i8_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i8_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f64: +; RV32: # %bb.0: +; 
RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf8 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i8_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf8 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i16_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i16_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsext.vf2 v28, v16 +; RV32-NEXT: vsll.vi v28, v28, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i16_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i16_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf4 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i16_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i16_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf4 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i16_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf4 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8i32_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8i32_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e32,m4,ta,mu +; RV32-NEXT: vsll.vi v28, v16, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8i32_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + call void 
@llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_sext_nxv8i32_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsext.vf2 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_sext_nxv8i32_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = sext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_zext_nxv8i32_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vzext.vf2 v24, v16 +; RV32-NEXT: vsll.vi v16, v24, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_zext_nxv8i32_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vzext.vf2 v24, v16 +; RV64-NEXT: vsll.vi v16, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %eidxs = zext %idxs to + %ptrs = getelementptr inbounds double, double* %base, %eidxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv8f64( %val, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv8f64: +; RV32: # %bb.0: +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsll.vi v16, v16, 3 +; RV32-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv8f64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsll.vi v16, v16, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v16, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + call void @llvm.masked.scatter.nxv8f64.nxv8p0f64( %val, %ptrs, i32 8, %m) + ret void +} + +declare void @llvm.masked.scatter.nxv16f64.nxv16p0f64(, , i32, ) + +declare @llvm.experimental.vector.insert.nxv8f64.nxv16f64(, , i64) +declare @llvm.experimental.vector.insert.nxv8p0f64.nxv16p0f64(, , i64) + +define void @mscatter_nxv16f64( %val0, %val1, %ptrs0, %ptrs1, %m) { +; RV32-LABEL: mscatter_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vl4re32.v v28, (a0) +; RV32-NEXT: vl4re32.v v24, (a1) +; RV32-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (zero), v28, v0.t +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: srli a0, a0, 3 +; RV32-NEXT: vsetvli a1, zero, e8,mf4,ta,mu +; RV32-NEXT: vslidedown.vx v0, v0, a0 +; RV32-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v16, (zero), v24, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a2, a2, 3 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: vl8re64.v v24, (a0) +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; RV64-NEXT: vl8re64.v v16, (a1) +; RV64-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV64-NEXT: vsoxei64.v v8, (zero), v24, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: srli a0, a0, 3 +; RV64-NEXT: vsetvli a1, zero, e8,mf4,ta,mu +; 
RV64-NEXT: vslidedown.vx v0, v0, a0 +; RV64-NEXT: vsetvli a0, zero, e64,m8,ta,mu +; RV64-NEXT: addi a0, sp, 16 +; RV64-NEXT: vl8re8.v v8, (a0) # Unknown-size Folded Reload +; RV64-NEXT: vsoxei64.v v8, (zero), v16, v0.t +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: slli a0, a0, 3 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %p0 = call @llvm.experimental.vector.insert.nxv8p0f64.nxv16p0f64( undef, %ptrs0, i64 0) + %p1 = call @llvm.experimental.vector.insert.nxv8p0f64.nxv16p0f64( %p0, %ptrs1, i64 8) + %v0 = call @llvm.experimental.vector.insert.nxv8f64.nxv16f64( undef, %val0, i64 0) + %v1 = call @llvm.experimental.vector.insert.nxv8f64.nxv16f64( %v0, %val1, i64 8) + call void @llvm.masked.scatter.nxv16f64.nxv16p0f64( %v1, %p1, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv16i8_nxv16f64( %val0, %val1, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv16i8_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vl2r.v v2, (a1) +; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu +; RV32-NEXT: vsext.vf4 v24, v2 +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli a2, zero, e8,mf4,ta,mu +; RV32-NEXT: vslidedown.vx v0, v0, a1 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv16i8_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: vl2r.v v2, (a1) +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v24, v2 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu +; RV64-NEXT: vslidedown.vx v0, v0, a1 +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf8 v8, v3 +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: ret + %ptrs = getelementptr inbounds double, double* %base, %idxs + %v0 = call @llvm.experimental.vector.insert.nxv8f64.nxv16f64( undef, %val0, i64 0) + %v1 = call @llvm.experimental.vector.insert.nxv8f64.nxv16f64( %v0, %val1, i64 8) + call void @llvm.masked.scatter.nxv16f64.nxv16p0f64( %v1, %ptrs, i32 8, %m) + ret void +} + +define void @mscatter_baseidx_nxv16i16_nxv16f64( %val0, %val1, double* %base, %idxs, %m) { +; RV32-LABEL: mscatter_baseidx_nxv16i16_nxv16f64: +; RV32: # %bb.0: +; RV32-NEXT: vl4re16.v v4, (a1) +; RV32-NEXT: vsetvli a1, zero, e32,m8,ta,mu +; RV32-NEXT: vsext.vf2 v24, v4 +; RV32-NEXT: vsll.vi v24, v24, 3 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v8, (a0), v24, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a1, a1, 3 +; RV32-NEXT: vsetvli a2, zero, e8,mf4,ta,mu +; RV32-NEXT: vslidedown.vx v0, v0, a1 +; RV32-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV32-NEXT: vsoxei32.v v16, (a0), v28, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_baseidx_nxv16i16_nxv16f64: +; RV64: # %bb.0: +; RV64-NEXT: vl4re16.v v4, (a1) +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v24, v4 +; RV64-NEXT: vsll.vi v24, v24, 3 +; RV64-NEXT: vsoxei64.v v8, (a0), v24, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 3 +; RV64-NEXT: vsetvli a2, zero, e8,mf4,ta,mu +; RV64-NEXT: vslidedown.vx v0, v0, a1 +; RV64-NEXT: vsetvli a1, zero, e64,m8,ta,mu +; RV64-NEXT: vsext.vf4 v8, v6 +; RV64-NEXT: vsll.vi v8, v8, 3 +; RV64-NEXT: vsoxei64.v v16, (a0), v8, v0.t +; RV64-NEXT: ret + %ptrs = 
getelementptr inbounds double, double* %base, <vscale x 16 x i16> %idxs
+  %v0 = call <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> undef, <vscale x 8 x double> %val0, i64 0)
+  %v1 = call <vscale x 16 x double> @llvm.experimental.vector.insert.nxv8f64.nxv16f64(<vscale x 16 x double> %v0, <vscale x 8 x double> %val1, i64 8)
+  call void @llvm.masked.scatter.nxv16f64.nxv16p0f64(<vscale x 16 x double> %v1, <vscale x 16 x double*> %ptrs, i32 8, <vscale x 16 x i1> %m)
+  ret void
+}
-- 
GitLab


From b1afa187c8ee0a42c66aace709069dbd195d012f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 18 Mar 2021 10:26:46 +0000
Subject: [PATCH 0017/1000] [DAG] SelectionDAG::isSplatValue - add ISD::ABS
 handling

Add ISD::ABS to the existing unary instruction handling for splat
detection. With abs(splat(x)) recognised as a splat, the WebAssembly
test below collapses from a per-lane extract/shift/replace_lane
sequence into a single vector shift.

This is similar to D83605, but doesn't appear to need to touch any of
the wasm refactoring.

Differential Revision: https://reviews.llvm.org/D98778
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  1 +
 .../WebAssembly/simd-shift-complex-splats.ll  | 52 +++----------------
 2 files changed, 9 insertions(+), 44 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index da891e1c2425..dedc25c079eb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2470,6 +2470,7 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
     }
     break;
   }
+  case ISD::ABS:
  case ISD::TRUNCATE:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
diff --git a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
index 4582bc62216a..00a963b959ed 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-shift-complex-splats.ll
@@ -28,29 +28,11 @@ define <16 x i8> @shl_add(<16 x i8> %v, i8 %a, i8 %b) {
 ; CHECK-LABEL: shl_abs:
 ; CHECK-NEXT: .functype shl_abs (v128, i32) -> (v128)
-; CHECK-NEXT: i8x16.extract_lane_u $push8=, $0, 0
 ; CHECK-NEXT: i8x16.splat $push0=, $1
-; CHECK-NEXT: i8x16.abs $push98=, $pop0
-; CHECK-NEXT: local.tee $push97=, $2=, $pop98
-; CHECK-NEXT: i8x16.extract_lane_u $push6=, $pop97, 0
-; CHECK-NEXT: i32.const $push2=, 7
-; CHECK-NEXT: i32.and $push7=, $pop6, $pop2
-; CHECK-NEXT: i32.shl $push9=, $pop8, $pop7
-; CHECK-NEXT: i8x16.splat $push10=, $pop9
-; CHECK-NEXT: i8x16.extract_lane_u $push4=, $0, 1
-; CHECK-NEXT: i8x16.extract_lane_u $push1=, $2, 1
-; CHECK-NEXT: i32.const $push96=, 7
-; CHECK-NEXT: i32.and $push3=, $pop1, $pop96
-; CHECK-NEXT: i32.shl $push5=, $pop4, $pop3
-; CHECK-NEXT: i8x16.replace_lane $push11=, $pop10, 1, $pop5
-; ...
-; CHECK: i8x16.extract_lane_u $push79=, $0, 15
-; CHECK-NEXT: i8x16.extract_lane_u $push77=, $2, 15
-; CHECK-NEXT: i32.const $push82=, 7
-; CHECK-NEXT: i32.and $push78=, $pop77, $pop82
-; CHECK-NEXT: i32.shl $push80=, $pop79, $pop78
-; CHECK-NEXT: i8x16.replace_lane $push81=, $pop76, 15, $pop80
-; CHECK-NEXT: return $pop81
+; CHECK-NEXT: i8x16.abs $push1=, $pop0
+; CHECK-NEXT: i8x16.extract_lane_u $push2=, $pop1, 0
+; CHECK-NEXT: i8x16.shl $push3=, $0, $pop2
+; CHECK-NEXT: return $pop3
 define <16 x i8> @shl_abs(<16 x i8> %v, i8 %a) {
   %t1 = insertelement <16 x i8> undef, i8 %a, i32 0
   %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
@@ -63,32 +45,14 @@ define <16 x i8> @shl_abs(<16 x i8> %v, i8 %a) {
 ; CHECK-LABEL: shl_abs_add:
 ; CHECK-NEXT: .functype shl_abs_add (v128, i32, i32) -> (v128)
-; CHECK-NEXT: i8x16.extract_lane_u $push11=, $0, 0
 ; CHECK-NEXT: i8x16.splat $push1=, $1
 ; CHECK-NEXT: i8x16.splat $push0=, $2
 ; CHECK-NEXT: i8x16.add $push2=, $pop1, $pop0
 ; CHECK-NEXT: i8x16.shuffle $push3=, $pop2, $0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-; CHECK-NEXT: i8x16.abs $push101=, $pop3
-; CHECK-NEXT: local.tee $push100=, $3=, $pop101
-; CHECK-NEXT: i8x16.extract_lane_u $push9=, $pop100, 0
-; CHECK-NEXT: i32.const $push5=, 7
-; CHECK-NEXT: i32.and $push10=, $pop9, $pop5
-; CHECK-NEXT: i32.shl $push12=, $pop11, $pop10
-; CHECK-NEXT: i8x16.splat $push13=, $pop12
-; CHECK-NEXT: i8x16.extract_lane_u $push7=, $0, 1
-; CHECK-NEXT: i8x16.extract_lane_u $push4=, $3, 1
-; CHECK-NEXT: i32.const $push99=, 7
-; CHECK-NEXT: i32.and $push6=, $pop4, $pop99
-; CHECK-NEXT: i32.shl $push8=, $pop7, $pop6
-; CHECK-NEXT: i8x16.replace_lane $push14=, $pop13, 1, $pop8
-; ...
-; CHECK: i8x16.extract_lane_u $push82=, $0, 15
-; CHECK-NEXT: i8x16.extract_lane_u $push80=, $3, 15
-; CHECK-NEXT: i32.const $push85=, 7
-; CHECK-NEXT: i32.and $push81=, $pop80, $pop85
-; CHECK-NEXT: i32.shl $push83=, $pop82, $pop81
-; CHECK-NEXT: i8x16.replace_lane $push84=, $pop79, 15, $pop83
-; CHECK-NEXT: return $pop84
+; CHECK-NEXT: i8x16.abs $push4=, $pop3
+; CHECK-NEXT: i8x16.extract_lane_u $push5=, $pop4, 0
+; CHECK-NEXT: i8x16.shl $push6=, $0, $pop5
+; CHECK-NEXT: return $pop6
 define <16 x i8> @shl_abs_add(<16 x i8> %v, i8 %a, i8 %b) {
   %t1 = insertelement <16 x i8> undef, i8 %a, i32 0
   %va = shufflevector <16 x i8> %t1, <16 x i8> undef, <16 x i32> zeroinitializer
-- 
GitLab


From f134a7158b1eb1b1a63dc402b46a57bf6e5ec897 Mon Sep 17 00:00:00 2001
From: Alexey Lapshin
Date: Fri, 12 Mar 2021 01:31:06 +0300
Subject: [PATCH 0018/1000] [llvm-objcopy] remove split dwo file creation from
 executeObjcopyOnBinary.

This patch removes creation of the resulting file from the
executeObjcopyOnBinary() function. For most use cases,
executeObjcopyOnBinary() already receives the output file as a
parameter (raw_ostream &Out). Splitting out the .dwo file is
implemented differently: the file containing the .dwo tables is
created inside executeObjcopyOnBinary() itself. When the objcopy
functionality is moved into a separate library, that implementation
becomes inconvenient. The goal of this refactoring is to separate
concerns: it might be convenient to do the .dwo tables splitting but
to create the resulting file differently.
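In other words, when --split-dwo is requested the driver now runs the
same objcopy pipeline twice instead of having the ELF layer open a
second output file itself. A simplified sketch of the new driver-side
control flow (a reading aid only, not part of the patch; the names come
from the llvm-objcopy.cpp hunk below, and error handling on the final
write is abbreviated):

  // ObjcopyFunc wraps executeObjcopyOn{RawBinary,IHex,Binary} for the
  // selected input format; writeToFile() opens the output stream.
  if (!Config.SplitDWO.empty()) {
    Config.ExtractDWO = true;   // pass 1: keep only the .dwo sections
    Config.StripDWO = false;
    if (Error E = writeToFile(Config.SplitDWO, ObjcopyFunc))
      return E;
    Config.ExtractDWO = false;  // pass 2: strip .dwo from the main output
    Config.StripDWO = true;
  }
  return writeToFile(Config.OutputFilename, ObjcopyFunc);

This keeps all file creation in one place (the driver), which is what a
library consumer of the objcopy implementation would want to control.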
Differential Revision: https://reviews.llvm.org/D98582
---
 llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp | 47 ++++---------
 llvm/tools/llvm-objcopy/llvm-objcopy.cpp   | 76 ++++++++++++++--------
 2 files changed, 61 insertions(+), 62 deletions(-)

diff --git a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
index 0cf0172c3550..d139814617b1 100644
--- a/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
+++ b/llvm/tools/llvm-objcopy/ELF/ELFObjcopy.cpp
@@ -173,32 +173,6 @@ static Error makeStringError(std::error_code EC, const Twine &Msg,
   return createStringError(EC, FullMsg.c_str(), std::forward<Ts>(Args)...);
 }
 
-static Error splitDWOToFile(const CopyConfig &Config, const Reader &Reader,
-                            StringRef File, ElfType OutputElfType) {
-  Expected<std::unique_ptr<Object>> DWOFile = Reader.create(false);
-  if (!DWOFile)
-    return DWOFile.takeError();
-
-  auto OnlyKeepDWOPred = [&DWOFile](const SectionBase &Sec) {
-    return onlyKeepDWOPred(**DWOFile, Sec);
-  };
-  if (Error E =
-          (*DWOFile)->removeSections(Config.AllowBrokenLinks, OnlyKeepDWOPred))
-    return E;
-  if (Config.OutputArch) {
-    (*DWOFile)->Machine = Config.OutputArch.getValue().EMachine;
-    (*DWOFile)->OSABI = Config.OutputArch.getValue().OSABI;
-  }
-
-  return writeToFile(File, [&](raw_ostream &OutFile) -> Error {
-    std::unique_ptr<Writer> Writer =
-        createWriter(Config, **DWOFile, OutFile, OutputElfType);
-    if (Error E = Writer->finalize())
-      return E;
-    return Writer->write();
-  });
-}
-
 static Error dumpSectionToFile(StringRef SecName, StringRef Filename,
                                Object &Obj) {
   for (auto &Sec : Obj.sections()) {
@@ -374,7 +348,7 @@ static Error replaceAndRemoveSections(const CopyConfig &Config, Object &Obj) {
     };
   }
 
-  if (Config.StripDWO || !Config.SplitDWO.empty())
+  if (Config.StripDWO)
     RemovePred = [RemovePred](const SectionBase &Sec) {
       return isDWOSection(Sec) || RemovePred(Sec);
     };
@@ -532,21 +506,22 @@ static Error replaceAndRemoveSections(const CopyConfig &Config, Object &Obj) {
 // any previous removals. Lastly whether or not something is removed shouldn't
 // depend a) on the order the options occur in or b) on some opaque priority
 // system. The only priority is that keeps/copies overrule removes.
-static Error handleArgs(const CopyConfig &Config, Object &Obj,
-                        const Reader &Reader, ElfType OutputElfType) {
+static Error handleArgs(const CopyConfig &Config, Object &Obj) {
   if (Config.StripSwiftSymbols || Config.KeepUndefined)
     return createStringError(llvm::errc::invalid_argument,
                              "option not supported by llvm-objcopy for ELF");
-  if (!Config.SplitDWO.empty())
-    if (Error E =
-            splitDWOToFile(Config, Reader, Config.SplitDWO, OutputElfType))
-      return E;
   if (Config.OutputArch) {
     Obj.Machine = Config.OutputArch.getValue().EMachine;
    Obj.OSABI = Config.OutputArch.getValue().OSABI;
   }
 
+  if (!Config.SplitDWO.empty() && Config.ExtractDWO) {
+    return Obj.removeSections(
+        Config.AllowBrokenLinks,
+        [&Obj](const SectionBase &Sec) { return onlyKeepDWOPred(Obj, Sec); });
+  }
+
   // Dump sections before add/remove for compatibility with GNU objcopy.
   for (StringRef Flag : Config.DumpSection) {
     StringRef SectionName;
@@ -706,7 +681,7 @@ Error executeObjcopyOnIHex(const CopyConfig &Config, MemoryBuffer &In,
   const ElfType OutputElfType =
       getOutputElfType(Config.OutputArch.getValueOr(MachineInfo()));
-  if (Error E = handleArgs(Config, **Obj, Reader, OutputElfType))
+  if (Error E = handleArgs(Config, **Obj))
     return E;
   return writeOutput(Config, **Obj, Out, OutputElfType);
 }
@@ -724,7 +699,7 @@ Error executeObjcopyOnRawBinary(const CopyConfig &Config, MemoryBuffer &In,
   // (-B).
   const ElfType OutputElfType =
       getOutputElfType(Config.OutputArch.getValueOr(MachineInfo()));
-  if (Error E = handleArgs(Config, **Obj, Reader, OutputElfType))
+  if (Error E = handleArgs(Config, **Obj))
     return E;
   return writeOutput(Config, **Obj, Out, OutputElfType);
 }
@@ -741,7 +716,7 @@ Error executeObjcopyOnBinary(const CopyConfig &Config,
       Config.OutputArch ? getOutputElfType(Config.OutputArch.getValue())
                         : getOutputElfType(In);
 
-  if (Error E = handleArgs(Config, **Obj, Reader, OutputElfType))
+  if (Error E = handleArgs(Config, **Obj))
     return createFileError(Config.InputFilename, std::move(E));
 
   if (Error E = writeOutput(Config, **Obj, Out, OutputElfType))
diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
index 68b5e97d09ed..a8a570abaab1 100644
--- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
+++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp
@@ -322,44 +322,68 @@ static Error executeObjcopy(CopyConfig &Config) {
     Stat.permissions(static_cast<sys::fs::perms>(0777));
   }
 
-  using ProcessRawFn = Error (*)(CopyConfig &, MemoryBuffer &, raw_ostream &);
-  ProcessRawFn ProcessRaw;
-  switch (Config.InputFormat) {
-  case FileFormat::Binary:
-    ProcessRaw = executeObjcopyOnRawBinary;
-    break;
-  case FileFormat::IHex:
-    ProcessRaw = executeObjcopyOnIHex;
-    break;
-  default:
-    ProcessRaw = nullptr;
-  }
+  std::function<Error(raw_ostream &)> ObjcopyFunc;
 
-  if (ProcessRaw) {
-    auto BufOrErr = MemoryBuffer::getFileOrSTDIN(Config.InputFilename);
+  OwningBinary<Binary> BinaryHolder;
+  std::unique_ptr<MemoryBuffer> MemoryBufferHolder;
+
+  if (Config.InputFormat == FileFormat::Binary ||
+      Config.InputFormat == FileFormat::IHex) {
+    ErrorOr<std::unique_ptr<MemoryBuffer>> BufOrErr =
+        MemoryBuffer::getFileOrSTDIN(Config.InputFilename);
     if (!BufOrErr)
       return createFileError(Config.InputFilename, BufOrErr.getError());
-
-    if (Error E = writeToFile(
-            Config.OutputFilename, [&](raw_ostream &OutFile) -> Error {
-              return ProcessRaw(Config, *BufOrErr->get(), OutFile);
-            }))
-      return E;
+    MemoryBufferHolder = std::move(*BufOrErr);
+
+    if (Config.InputFormat == FileFormat::Binary)
+      ObjcopyFunc = [&](raw_ostream &OutFile) -> Error {
+        // Handle FileFormat::Binary.
+        return executeObjcopyOnRawBinary(Config, *MemoryBufferHolder, OutFile);
+      };
+    else
+      ObjcopyFunc = [&](raw_ostream &OutFile) -> Error {
+        // Handle FileFormat::IHex.
+        return executeObjcopyOnIHex(Config, *MemoryBufferHolder, OutFile);
+      };
   } else {
     Expected<OwningBinary<Binary>> BinaryOrErr =
        createBinary(Config.InputFilename);
     if (!BinaryOrErr)
       return createFileError(Config.InputFilename, BinaryOrErr.takeError());
+    BinaryHolder = std::move(*BinaryOrErr);
 
-    if (Archive *Ar = dyn_cast<Archive>(BinaryOrErr.get().getBinary())) {
+    if (Archive *Ar = dyn_cast<Archive>(BinaryHolder.getBinary())) {
+      // Handle Archive.
       if (Error E = executeObjcopyOnArchive(Config, *Ar))
         return E;
     } else {
-      if (Error E = writeToFile(
-              Config.OutputFilename, [&](raw_ostream &OutFile) -> Error {
-                return executeObjcopyOnBinary(
-                    Config, *BinaryOrErr.get().getBinary(), OutFile);
-              }))
+      // Handle llvm::object::Binary.
+ ObjcopyFunc = [&](raw_ostream &OutFile) -> Error { + return executeObjcopyOnBinary(Config, *BinaryHolder.getBinary(), + OutFile); + }; + } + } + + if (ObjcopyFunc) { + if (Config.SplitDWO.empty()) { + // Apply transformations described by Config and store result into + // Config.OutputFilename using specified ObjcopyFunc function. + if (Error E = writeToFile(Config.OutputFilename, ObjcopyFunc)) + return E; + } else { + Config.ExtractDWO = true; + Config.StripDWO = false; + // Copy .dwo tables from the Config.InputFilename into Config.SplitDWO + // file using specified ObjcopyFunc function. + if (Error E = writeToFile(Config.SplitDWO, ObjcopyFunc)) + return E; + Config.ExtractDWO = false; + Config.StripDWO = true; + // Apply transformations described by Config, remove .dwo tables and + // store result into Config.OutputFilename using specified ObjcopyFunc + // function. + if (Error E = writeToFile(Config.OutputFilename, ObjcopyFunc)) return E; } } -- GitLab From d9b5338cfbd49fbc019d03e7151399eab77b884b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Mar 2021 11:07:16 +0000 Subject: [PATCH 0019/1000] [ARM] Regenerate select-imm.ll tests --- llvm/test/CodeGen/ARM/select-imm.ll | 981 +++++++++++++++++++++------- 1 file changed, 747 insertions(+), 234 deletions(-) diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll index 5251f71e64a0..4682c2fb1bf0 100644 --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -1,104 +1,212 @@ -; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s --check-prefix=ARM - -; RUN: llc -mtriple=arm-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \ -; RUN: | FileCheck %s --check-prefix=ARMT2 - -; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m0 %s -o - \ -; RUN: | FileCheck %s --check-prefix=THUMB1 - -; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \ -; RUN: | FileCheck %s --check-prefix=THUMB2 - -; RUN: llc -mtriple=thumbv8m.base-eabi %s -o - \ -; RUN: | FileCheck %s --check-prefix=V8MBASE +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=arm-eabi | FileCheck %s --check-prefix=ARM +; RUN: llc < %s -mtriple=arm-eabi -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s --check-prefix=ARMT2 +; RUN: llc < %s -mtriple=thumb-eabi -mcpu=cortex-m0 | FileCheck %s --check-prefix=THUMB1 +; RUN: llc < %s -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 | FileCheck %s --check-prefix=THUMB2 +; RUN: llc < %s -mtriple=thumbv8m.base-eabi | FileCheck %s --check-prefix=V8MBASE define i32 @t1(i32 %c) nounwind readnone { -entry: ; ARM-LABEL: t1: -; ARM: mov [[R1:r[0-9]+]], #101 -; ARM: orr [[R1b:r[0-9]+]], [[R1]], #256 -; ARM: movgt {{r[0-1]}}, #123 - +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: mov r1, #101 +; ARM-NEXT: cmp r0, #1 +; ARM-NEXT: orr r1, r1, #256 +; ARM-NEXT: movgt r1, #123 +; ARM-NEXT: mov r0, r1 +; ARM-NEXT: mov pc, lr +; ; ARMT2-LABEL: t1: -; ARMT2: movw [[R:r[0-1]]], #357 -; ARMT2: movwgt [[R]], #123 - +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: movw r1, #357 +; ARMT2-NEXT: cmp r0, #1 +; ARMT2-NEXT: movwgt r1, #123 +; ARMT2-NEXT: mov r0, r1 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t1: -; THUMB1: cmp r0, #1 -; THUMB1: bgt .LBB0_2 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: cmp r0, #1 +; THUMB1-NEXT: bgt .LBB0_2 +; THUMB1-NEXT: @ %bb.1: @ %entry +; THUMB1-NEXT: movs r0, #255 +; THUMB1-NEXT: adds r0, #102 +; THUMB1-NEXT: bx lr +; THUMB1-NEXT: .LBB0_2: +; THUMB1-NEXT: movs r0, #123 +; THUMB1-NEXT: bx lr +; ; THUMB2-LABEL: t1: -; 
THUMB2: movw [[R:r[0-1]]], #357 -; THUMB2: movgt [[R]], #123 - +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: movw r1, #357 +; THUMB2-NEXT: cmp r0, #1 +; THUMB2-NEXT: it gt +; THUMB2-NEXT: movgt r1, #123 +; THUMB2-NEXT: mov r0, r1 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t1: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: cmp r0, #1 +; V8MBASE-NEXT: bgt .LBB0_2 +; V8MBASE-NEXT: @ %bb.1: @ %entry +; V8MBASE-NEXT: movw r0, #357 +; V8MBASE-NEXT: bx lr +; V8MBASE-NEXT: .LBB0_2: +; V8MBASE-NEXT: movs r0, #123 +; V8MBASE-NEXT: bx lr +entry: %0 = icmp sgt i32 %c, 1 %1 = select i1 %0, i32 123, i32 357 ret i32 %1 } define i32 @t2(i32 %c) nounwind readnone { -entry: ; ARM-LABEL: t2: -; ARM: mov [[R:r[0-9]+]], #101 -; ARM: orr [[R]], [[R]], #256 -; ARM: movle [[R]], #123 - +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: mov r1, #101 +; ARM-NEXT: cmp r0, #1 +; ARM-NEXT: orr r1, r1, #256 +; ARM-NEXT: movle r1, #123 +; ARM-NEXT: mov r0, r1 +; ARM-NEXT: mov pc, lr +; ; ARMT2-LABEL: t2: -; ARMT2: mov [[R:r[0-1]]], #123 -; ARMT2: movwgt [[R]], #357 - +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: mov r1, #123 +; ARMT2-NEXT: cmp r0, #1 +; ARMT2-NEXT: movwgt r1, #357 +; ARMT2-NEXT: mov r0, r1 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t2: -; THUMB1: cmp r{{[0-9]+}}, #1 -; THUMB1: bgt - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: cmp r0, #1 +; THUMB1-NEXT: bgt .LBB1_2 +; THUMB1-NEXT: @ %bb.1: @ %entry +; THUMB1-NEXT: movs r0, #123 +; THUMB1-NEXT: bx lr +; THUMB1-NEXT: .LBB1_2: +; THUMB1-NEXT: movs r0, #255 +; THUMB1-NEXT: adds r0, #102 +; THUMB1-NEXT: bx lr +; ; THUMB2-LABEL: t2: -; THUMB2: mov{{(s|\.w)}} [[R:r[0-1]]], #123 -; THUMB2: movwgt [[R]], #357 - +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: movs r1, #123 +; THUMB2-NEXT: cmp r0, #1 +; THUMB2-NEXT: it gt +; THUMB2-NEXT: movwgt r1, #357 +; THUMB2-NEXT: mov r0, r1 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t2: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: mov r1, r0 +; V8MBASE-NEXT: movw r0, #357 +; V8MBASE-NEXT: cmp r1, #1 +; V8MBASE-NEXT: bgt .LBB1_2 +; V8MBASE-NEXT: @ %bb.1: @ %entry +; V8MBASE-NEXT: movs r0, #123 +; V8MBASE-NEXT: .LBB1_2: @ %entry +; V8MBASE-NEXT: bx lr +entry: %0 = icmp sgt i32 %c, 1 %1 = select i1 %0, i32 357, i32 123 ret i32 %1 } define i32 @t3(i32 %a) nounwind readnone { -entry: ; ARM-LABEL: t3: -; ARM: rsbs r1, r0, #0 -; ARM: adc r0, r0, r1 - +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: sub r0, r0, #160 +; ARM-NEXT: rsbs r1, r0, #0 +; ARM-NEXT: adc r0, r0, r1 +; ARM-NEXT: mov pc, lr +; ; ARMT2-LABEL: t3: -; ARMT2: clz r0, r0 -; ARMT2: lsr r0, r0, #5 - +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: sub r0, r0, #160 +; ARMT2-NEXT: clz r0, r0 +; ARMT2-NEXT: lsr r0, r0, #5 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t3: -; THUMB1: rsbs r1, r0, #0 -; THUMB1: adcs r0, r1 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: subs r0, #160 +; THUMB1-NEXT: rsbs r1, r0, #0 +; THUMB1-NEXT: adcs r0, r1 +; THUMB1-NEXT: bx lr +; ; THUMB2-LABEL: t3: -; THUMB2: clz r0, r0 -; THUMB2: lsrs r0, r0, #5 +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: subs r0, #160 +; THUMB2-NEXT: clz r0, r0 +; THUMB2-NEXT: lsrs r0, r0, #5 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t3: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: subs r0, #160 +; V8MBASE-NEXT: rsbs r1, r0, #0 +; V8MBASE-NEXT: adcs r0, r1 +; V8MBASE-NEXT: bx lr +entry: %0 = icmp eq i32 %a, 160 %1 = zext i1 %0 to i32 ret i32 %1 } define i32 @t4(i32 %a, i32 %b, i32 %x) nounwind { -entry: ; ARM-LABEL: t4: -; ARM: mvn [[R0:r[0-9]+]], #170 -; ARM: sub [[R0:r[0-9]+]], [[R0:r[0-9]+]], #11141120 -; ARM: mov{{lt|ge}} - +; ARM: @ %bb.0: @ %entry 
+; ARM-NEXT: mvn r3, #170 +; ARM-NEXT: cmp r0, r1 +; ARM-NEXT: sub r3, r3, #11141120 +; ARM-NEXT: movge r3, r2 +; ARM-NEXT: mov r0, r3 +; ARM-NEXT: mov pc, lr +; ; ARMT2-LABEL: t4: -; ARMT2: movwlt [[R0:r[0-9]+]], #65365 -; ARMT2: movtlt [[R0]], #65365 - +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: cmp r0, r1 +; ARMT2-NEXT: movwlt r2, #65365 +; ARMT2-NEXT: movtlt r2, #65365 +; ARMT2-NEXT: mov r0, r2 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t4: -; THUMB1: cmp r{{[0-9]+}}, r{{[0-9]+}} -; THUMB1: b{{lt|ge}} - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: cmp r0, r1 +; THUMB1-NEXT: bge .LBB3_2 +; THUMB1-NEXT: @ %bb.1: +; THUMB1-NEXT: ldr r2, .LCPI3_0 +; THUMB1-NEXT: .LBB3_2: @ %entry +; THUMB1-NEXT: mov r0, r2 +; THUMB1-NEXT: bx lr +; THUMB1-NEXT: .p2align 2 +; THUMB1-NEXT: @ %bb.3: +; THUMB1-NEXT: .LCPI3_0: +; THUMB1-NEXT: .long 4283826005 @ 0xff55ff55 +; ; THUMB2-LABEL: t4: -; THUMB2: mvnlt [[R0:r[0-9]+]], #11141290 +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: cmp r0, r1 +; THUMB2-NEXT: it lt +; THUMB2-NEXT: mvnlt r2, #11141290 +; THUMB2-NEXT: mov r0, r2 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t4: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: cmp r0, r1 +; V8MBASE-NEXT: bge .LBB3_2 +; V8MBASE-NEXT: @ %bb.1: +; V8MBASE-NEXT: movw r2, #65365 +; V8MBASE-NEXT: movt r2, #65365 +; V8MBASE-NEXT: .LBB3_2: @ %entry +; V8MBASE-NEXT: mov r0, r2 +; V8MBASE-NEXT: bx lr +entry: %0 = icmp slt i32 %a, %b %1 = select i1 %0, i32 4283826005, i32 %x ret i32 %1 @@ -106,105 +214,206 @@ entry: ; rdar://9758317 define i32 @t5(i32 %a) nounwind { -entry: ; ARM-LABEL: t5: -; ARM-NOT: mov -; ARM: sub r0, r0, #1 -; ARM-NOT: mov -; ARM: rsbs r1, r0, #0 -; ARM: adc r0, r0, r1 - +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: sub r0, r0, #1 +; ARM-NEXT: rsbs r1, r0, #0 +; ARM-NEXT: adc r0, r0, r1 +; ARM-NEXT: mov pc, lr +; +; ARMT2-LABEL: t5: +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: sub r0, r0, #1 +; ARMT2-NEXT: clz r0, r0 +; ARMT2-NEXT: lsr r0, r0, #5 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t5: -; THUMB1-NOT: bne -; THUMB1: rsbs r0, r1, #0 -; THUMB1: adcs r0, r1 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: subs r1, r0, #1 +; THUMB1-NEXT: rsbs r0, r1, #0 +; THUMB1-NEXT: adcs r0, r1 +; THUMB1-NEXT: bx lr +; ; THUMB2-LABEL: t5: -; THUMB2-NOT: mov -; THUMB2: subs r0, #1 -; THUMB2: clz r0, r0 -; THUMB2: lsrs r0, r0, #5 - +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: subs r0, #1 +; THUMB2-NEXT: clz r0, r0 +; THUMB2-NEXT: lsrs r0, r0, #5 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t5: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: subs r1, r0, #1 +; V8MBASE-NEXT: rsbs r0, r1, #0 +; V8MBASE-NEXT: adcs r0, r1 +; V8MBASE-NEXT: bx lr +entry: %cmp = icmp eq i32 %a, 1 %conv = zext i1 %cmp to i32 ret i32 %conv } define i32 @t6(i32 %a) nounwind { -entry: ; ARM-LABEL: t6: -; ARM-NOT: mov -; ARM: cmp r0, #0 -; ARM: movne r0, #1 - +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: cmp r0, #0 +; ARM-NEXT: movne r0, #1 +; ARM-NEXT: mov pc, lr +; +; ARMT2-LABEL: t6: +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: cmp r0, #0 +; ARMT2-NEXT: movwne r0, #1 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t6: -; THUMB1: subs r1, r0, #1 -; THUMB1: sbcs r0, r1 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: subs r1, r0, #1 +; THUMB1-NEXT: sbcs r0, r1 +; THUMB1-NEXT: bx lr +; ; THUMB2-LABEL: t6: -; THUMB2-NOT: mov -; THUMB2: cmp r0, #0 -; THUMB2: it ne -; THUMB2: movne r0, #1 +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: cmp r0, #0 +; THUMB2-NEXT: it ne +; THUMB2-NEXT: movne r0, #1 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t6: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: subs r1, 
r0, #1 +; V8MBASE-NEXT: sbcs r0, r1 +; V8MBASE-NEXT: bx lr +entry: %tobool = icmp ne i32 %a, 0 %lnot.ext = zext i1 %tobool to i32 ret i32 %lnot.ext } define i32 @t7(i32 %a, i32 %b) nounwind readnone { -entry: ; ARM-LABEL: t7: -; ARM: subs r0, r0, r1 -; ARM: movne r0, #1 -; ARM: lsl r0, r0, #2 - +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: subs r0, r0, r1 +; ARM-NEXT: movne r0, #1 +; ARM-NEXT: lsl r0, r0, #2 +; ARM-NEXT: mov pc, lr +; ; ARMT2-LABEL: t7: -; ARMT2: subs r0, r0, r1 -; ARMT2: movwne r0, #1 -; ARMT2: lsl r0, r0, #2 - +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: subs r0, r0, r1 +; ARMT2-NEXT: movwne r0, #1 +; ARMT2-NEXT: lsl r0, r0, #2 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t7: -; THUMB1: subs r0, r0, r1 -; THUMB1: subs r1, r0, #1 -; THUMB1: sbcs r0, r1 -; THUMB1: lsls r0, r0, #2 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: subs r0, r0, r1 +; THUMB1-NEXT: subs r1, r0, #1 +; THUMB1-NEXT: sbcs r0, r1 +; THUMB1-NEXT: lsls r0, r0, #2 +; THUMB1-NEXT: bx lr +; ; THUMB2-LABEL: t7: -; THUMB2: subs r0, r0, r1 -; THUMB2: it ne -; THUMB2: movne r0, #1 -; THUMB2: lsls r0, r0, #2 +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: subs r0, r0, r1 +; THUMB2-NEXT: it ne +; THUMB2-NEXT: movne r0, #1 +; THUMB2-NEXT: lsls r0, r0, #2 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t7: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: subs r0, r0, r1 +; V8MBASE-NEXT: subs r1, r0, #1 +; V8MBASE-NEXT: sbcs r0, r1 +; V8MBASE-NEXT: lsls r0, r0, #2 +; V8MBASE-NEXT: bx lr +entry: %0 = icmp ne i32 %a, %b %1 = select i1 %0, i32 4, i32 0 ret i32 %1 } -define void @t8(i32 %a) { -entry: - ; ARM scheduler emits icmp/zext before both calls, so isn't relevant - +define void @t8(i32 %a) { +; ARM-LABEL: t8: +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: .save {r4, lr} +; ARM-NEXT: push {r4, lr} +; ARM-NEXT: mov r1, r0 +; ARM-NEXT: sub r0, r0, #5 +; ARM-NEXT: rsbs r2, r0, #0 +; ARM-NEXT: adc r4, r0, r2 +; ARM-NEXT: mov r0, #9 +; ARM-NEXT: bl t7 +; ARM-NEXT: mov r1, r0 +; ARM-NEXT: mov r0, r4 +; ARM-NEXT: pop {r4, lr} +; ARM-NEXT: b t7 +; ; ARMT2-LABEL: t8: -; ARMT2: bl t7 -; ARMT2: mov r1, r0 -; ARMT2: sub r0, r4, #5 -; ARMT2: clz r0, r0 -; ARMT2: lsr r0, r0, #5 - +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: .save {r4, lr} +; ARMT2-NEXT: push {r4, lr} +; ARMT2-NEXT: mov r4, r0 +; ARMT2-NEXT: mov r0, #9 +; ARMT2-NEXT: mov r1, r4 +; ARMT2-NEXT: bl t7 +; ARMT2-NEXT: mov r1, r0 +; ARMT2-NEXT: sub r0, r4, #5 +; ARMT2-NEXT: clz r0, r0 +; ARMT2-NEXT: lsr r0, r0, #5 +; ARMT2-NEXT: pop {r4, lr} +; ARMT2-NEXT: b t7 +; ; THUMB1-LABEL: t8: -; THUMB1: bl t7 -; THUMB1: mov r1, r0 -; THUMB1: subs r2, r4, #5 -; THUMB1: rsbs r0, r2, #0 -; THUMB1: adcs r0, r2 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: mov r4, r0 +; THUMB1-NEXT: movs r0, #9 +; THUMB1-NEXT: mov r1, r4 +; THUMB1-NEXT: bl t7 +; THUMB1-NEXT: mov r1, r0 +; THUMB1-NEXT: subs r2, r4, #5 +; THUMB1-NEXT: rsbs r0, r2, #0 +; THUMB1-NEXT: adcs r0, r2 +; THUMB1-NEXT: bl t7 +; THUMB1-NEXT: pop {r4, pc} +; ; THUMB2-LABEL: t8: -; THUMB2: bl t7 -; THUMB2: mov r1, r0 -; THUMB2: subs r0, r4, #5 -; THUMB2: clz r0, r0 -; THUMB2: lsrs r0, r0, #5 - +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, lr} +; THUMB2-NEXT: push {r4, lr} +; THUMB2-NEXT: mov r4, r0 +; THUMB2-NEXT: movs r0, #9 +; THUMB2-NEXT: mov r1, r4 +; THUMB2-NEXT: bl t7 +; THUMB2-NEXT: mov r1, r0 +; THUMB2-NEXT: subs r0, r4, #5 +; THUMB2-NEXT: clz r0, r0 +; THUMB2-NEXT: lsrs r0, r0, #5 +; THUMB2-NEXT: pop.w {r4, lr} +; THUMB2-NEXT: b t7 +; +; V8MBASE-LABEL: t8: +; V8MBASE: @ %bb.0: @ 
%entry +; V8MBASE-NEXT: .save {r4, lr} +; V8MBASE-NEXT: push {r4, lr} +; V8MBASE-NEXT: mov r1, r0 +; V8MBASE-NEXT: subs r0, r0, #5 +; V8MBASE-NEXT: rsbs r4, r0, #0 +; V8MBASE-NEXT: adcs r4, r0 +; V8MBASE-NEXT: movs r0, #9 +; V8MBASE-NEXT: bl t7 +; V8MBASE-NEXT: mov r1, r0 +; V8MBASE-NEXT: mov r0, r4 +; V8MBASE-NEXT: pop {r4} +; V8MBASE-NEXT: pop {r2} +; V8MBASE-NEXT: mov lr, r2 +; V8MBASE-NEXT: b t7 +entry: %cmp = icmp eq i32 %a, 5 %conv = zext i1 %cmp to i32 %call = tail call i32 @t7(i32 9, i32 %a) @@ -212,44 +421,127 @@ entry: ret void } -define void @t9(i8* %a, i8 %b) { -entry: - ; ARM scheduler emits icmp/zext before both calls, so isn't relevant - +define void @t9(i8* %a, i8 %b) { +; ARM-LABEL: t9: +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: .save {r4, lr} +; ARM-NEXT: push {r4, lr} +; ARM-NEXT: ldrsb r4, [r0] +; ARM-NEXT: mov r0, #1 +; ARM-NEXT: bl f +; ARM-NEXT: and r0, r4, #255 +; ARM-NEXT: cmp r0, r0 +; ARM-NEXT: bne .LBB8_3 +; ARM-NEXT: @ %bb.1: @ %while.body.preheader +; ARM-NEXT: add r1, r4, #1 +; ARM-NEXT: mov r2, r0 +; ARM-NEXT: .LBB8_2: @ %while.body +; ARM-NEXT: @ =>This Inner Loop Header: Depth=1 +; ARM-NEXT: add r2, r2, #1 +; ARM-NEXT: add r1, r1, #1 +; ARM-NEXT: and r3, r2, #255 +; ARM-NEXT: cmp r3, r0 +; ARM-NEXT: blt .LBB8_2 +; ARM-NEXT: .LBB8_3: @ %while.end +; ARM-NEXT: pop {r4, lr} +; ARM-NEXT: mov pc, lr +; ; ARMT2-LABEL: t9: -; ARMT2: bl f -; ARMT2: uxtb r0, r4 -; ARMT2: cmp r0, r0 -; ARMT2: add r1, r4, #1 -; ARMT2: mov r2, r0 -; ARMT2: add r2, r2, #1 -; ARMT2: add r1, r1, #1 -; ARMT2: uxtb r3, r2 -; ARMT2: cmp r3, r0 - +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: .save {r4, lr} +; ARMT2-NEXT: push {r4, lr} +; ARMT2-NEXT: ldrsb r4, [r0] +; ARMT2-NEXT: mov r0, #1 +; ARMT2-NEXT: bl f +; ARMT2-NEXT: uxtb r0, r4 +; ARMT2-NEXT: cmp r0, r0 +; ARMT2-NEXT: popne {r4, pc} +; ARMT2-NEXT: .LBB8_1: @ %while.body.preheader +; ARMT2-NEXT: add r1, r4, #1 +; ARMT2-NEXT: mov r2, r0 +; ARMT2-NEXT: .LBB8_2: @ %while.body +; ARMT2-NEXT: @ =>This Inner Loop Header: Depth=1 +; ARMT2-NEXT: add r2, r2, #1 +; ARMT2-NEXT: add r1, r1, #1 +; ARMT2-NEXT: uxtb r3, r2 +; ARMT2-NEXT: cmp r3, r0 +; ARMT2-NEXT: blt .LBB8_2 +; ARMT2-NEXT: @ %bb.3: @ %while.end +; ARMT2-NEXT: pop {r4, pc} +; ; THUMB1-LABEL: t9: -; THUMB1: bl f -; THUMB1: uxtb r0, r4 -; THUMB1: cmp r0, r0 -; THUMB1: adds r1, r4, #1 -; THUMB1: mov r2, r0 -; THUMB1: adds r1, r1, #1 -; THUMB1: adds r2, r2, #1 -; THUMB1: uxtb r3, r2 -; THUMB1: cmp r3, r0 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, lr} +; THUMB1-NEXT: push {r4, lr} +; THUMB1-NEXT: movs r1, #0 +; THUMB1-NEXT: ldrsb r4, [r0, r1] +; THUMB1-NEXT: movs r0, #1 +; THUMB1-NEXT: bl f +; THUMB1-NEXT: uxtb r0, r4 +; THUMB1-NEXT: cmp r0, r0 +; THUMB1-NEXT: bne .LBB8_3 +; THUMB1-NEXT: @ %bb.1: @ %while.body.preheader +; THUMB1-NEXT: adds r1, r4, #1 +; THUMB1-NEXT: mov r2, r0 +; THUMB1-NEXT: .LBB8_2: @ %while.body +; THUMB1-NEXT: @ =>This Inner Loop Header: Depth=1 +; THUMB1-NEXT: adds r1, r1, #1 +; THUMB1-NEXT: adds r2, r2, #1 +; THUMB1-NEXT: uxtb r3, r2 +; THUMB1-NEXT: cmp r3, r0 +; THUMB1-NEXT: blt .LBB8_2 +; THUMB1-NEXT: .LBB8_3: @ %while.end +; THUMB1-NEXT: pop {r4, pc} +; ; THUMB2-LABEL: t9: -; THUMB2: bl f -; THUMB2: uxtb r0, r4 -; THUMB2: cmp r0, r0 -; THUMB2: adds r1, r4, #1 -; THUMB2: mov r2, r0 -; THUMB2: adds r2, #1 -; THUMB2: adds r1, #1 -; THUMB2: uxtb r3, r2 -; THUMB2: cmp r3, r0 - +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r4, lr} +; THUMB2-NEXT: push {r4, lr} +; THUMB2-NEXT: ldrsb.w r4, [r0] +; THUMB2-NEXT: movs r0, #1 +; THUMB2-NEXT: bl f +; 
THUMB2-NEXT: uxtb r0, r4 +; THUMB2-NEXT: cmp r0, r0 +; THUMB2-NEXT: it ne +; THUMB2-NEXT: popne {r4, pc} +; THUMB2-NEXT: .LBB8_1: @ %while.body.preheader +; THUMB2-NEXT: adds r1, r4, #1 +; THUMB2-NEXT: mov r2, r0 +; THUMB2-NEXT: .LBB8_2: @ %while.body +; THUMB2-NEXT: @ =>This Inner Loop Header: Depth=1 +; THUMB2-NEXT: adds r2, #1 +; THUMB2-NEXT: adds r1, #1 +; THUMB2-NEXT: uxtb r3, r2 +; THUMB2-NEXT: cmp r3, r0 +; THUMB2-NEXT: blt .LBB8_2 +; THUMB2-NEXT: @ %bb.3: @ %while.end +; THUMB2-NEXT: pop {r4, pc} +; +; V8MBASE-LABEL: t9: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: .save {r4, lr} +; V8MBASE-NEXT: push {r4, lr} +; V8MBASE-NEXT: movs r1, #0 +; V8MBASE-NEXT: ldrsb r4, [r0, r1] +; V8MBASE-NEXT: movs r0, #1 +; V8MBASE-NEXT: bl f +; V8MBASE-NEXT: uxtb r0, r4 +; V8MBASE-NEXT: cmp r0, r0 +; V8MBASE-NEXT: bne .LBB8_3 +; V8MBASE-NEXT: @ %bb.1: @ %while.body.preheader +; V8MBASE-NEXT: adds r1, r4, #1 +; V8MBASE-NEXT: mov r2, r0 +; V8MBASE-NEXT: .LBB8_2: @ %while.body +; V8MBASE-NEXT: @ =>This Inner Loop Header: Depth=1 +; V8MBASE-NEXT: adds r1, r1, #1 +; V8MBASE-NEXT: adds r2, r2, #1 +; V8MBASE-NEXT: uxtb r3, r2 +; V8MBASE-NEXT: cmp r3, r0 +; V8MBASE-NEXT: blt .LBB8_2 +; V8MBASE-NEXT: .LBB8_3: @ %while.end +; V8MBASE-NEXT: pop {r4, pc} +entry: %0 = load i8, i8* %a %conv = sext i8 %0 to i32 %conv119 = zext i8 %0 to i32 @@ -274,8 +566,104 @@ while.end: declare void @f(i1 zeroext) - define i1 @t10() { +; ARM-LABEL: t10: +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: .save {r11, lr} +; ARM-NEXT: push {r11, lr} +; ARM-NEXT: .pad #8 +; ARM-NEXT: sub sp, sp, #8 +; ARM-NEXT: mvn r0, #2 +; ARM-NEXT: mvn r1, #7 +; ARM-NEXT: str r0, [sp, #4] +; ARM-NEXT: mvn r0, #7 +; ARM-NEXT: str r0, [sp] +; ARM-NEXT: mvn r0, #2 +; ARM-NEXT: bl __aeabi_idivmod +; ARM-NEXT: sub r0, r1, r0, lsl #3 +; ARM-NEXT: add r0, r0, #3 +; ARM-NEXT: rsbs r1, r0, #0 +; ARM-NEXT: adc r0, r0, r1 +; ARM-NEXT: add sp, sp, #8 +; ARM-NEXT: pop {r11, lr} +; ARM-NEXT: mov pc, lr +; +; ARMT2-LABEL: t10: +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: .save {r11, lr} +; ARMT2-NEXT: push {r11, lr} +; ARMT2-NEXT: .pad #8 +; ARMT2-NEXT: sub sp, sp, #8 +; ARMT2-NEXT: mvn r0, #2 +; ARMT2-NEXT: str r0, [sp, #4] +; ARMT2-NEXT: mvn r0, #7 +; ARMT2-NEXT: str r0, [sp] +; ARMT2-NEXT: mvn r0, #2 +; ARMT2-NEXT: mvn r1, #7 +; ARMT2-NEXT: bl __aeabi_idivmod +; ARMT2-NEXT: sub r0, r1, r0, lsl #3 +; ARMT2-NEXT: add r0, r0, #3 +; ARMT2-NEXT: clz r0, r0 +; ARMT2-NEXT: lsr r0, r0, #5 +; ARMT2-NEXT: add sp, sp, #8 +; ARMT2-NEXT: pop {r11, pc} +; +; THUMB1-LABEL: t10: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .save {r4, r5, r7, lr} +; THUMB1-NEXT: push {r4, r5, r7, lr} +; THUMB1-NEXT: .pad #8 +; THUMB1-NEXT: sub sp, #8 +; THUMB1-NEXT: movs r0, #7 +; THUMB1-NEXT: mvns r4, r0 +; THUMB1-NEXT: str r4, [sp] +; THUMB1-NEXT: adds r5, r4, #5 +; THUMB1-NEXT: str r5, [sp, #4] +; THUMB1-NEXT: mov r0, r5 +; THUMB1-NEXT: mov r1, r4 +; THUMB1-NEXT: bl __aeabi_idivmod +; THUMB1-NEXT: muls r0, r4, r0 +; THUMB1-NEXT: adds r0, r0, r1 +; THUMB1-NEXT: subs r1, r0, r5 +; THUMB1-NEXT: rsbs r0, r1, #0 +; THUMB1-NEXT: adcs r0, r1 +; THUMB1-NEXT: add sp, #8 +; THUMB1-NEXT: pop {r4, r5, r7, pc} +; +; THUMB2-LABEL: t10: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: .pad #8 +; THUMB2-NEXT: sub sp, #8 +; THUMB2-NEXT: mvn r0, #2 +; THUMB2-NEXT: str r0, [sp, #4] +; THUMB2-NEXT: mvn r0, #7 +; THUMB2-NEXT: str r0, [sp] +; THUMB2-NEXT: mvn r0, #2 +; THUMB2-NEXT: mvn r1, #7 +; THUMB2-NEXT: bl __aeabi_idivmod +; THUMB2-NEXT: sub.w r0, r1, r0, lsl 
#3 +; THUMB2-NEXT: adds r0, #3 +; THUMB2-NEXT: clz r0, r0 +; THUMB2-NEXT: lsrs r0, r0, #5 +; THUMB2-NEXT: add sp, #8 +; THUMB2-NEXT: pop {r7, pc} +; +; V8MBASE-LABEL: t10: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: .pad #8 +; V8MBASE-NEXT: sub sp, #8 +; V8MBASE-NEXT: movs r0, #7 +; V8MBASE-NEXT: mvns r0, r0 +; V8MBASE-NEXT: str r0, [sp] +; V8MBASE-NEXT: adds r0, r0, #5 +; V8MBASE-NEXT: str r0, [sp, #4] +; V8MBASE-NEXT: movs r1, #0 +; V8MBASE-NEXT: rsbs r0, r1, #0 +; V8MBASE-NEXT: adcs r0, r1 +; V8MBASE-NEXT: add sp, #8 +; V8MBASE-NEXT: bx lr entry: %q = alloca i32 %p = alloca i32 @@ -289,29 +677,134 @@ entry: %add = add nsw i32 %mul, %rem %cmp = icmp eq i32 %add, %0 ret i1 %cmp - -; ARM-LABEL: t10: -; ARM: rsbs r1, r0, #0 -; ARM: adc r0, r0, r1 - -; ARMT2-LABEL: t10: -; ARMT2: clz r0, r0 -; ARMT2: lsr r0, r0, #5 - -; THUMB1-LABEL: t10: -; THUMB1: rsbs r0, r1, #0 -; THUMB1: adcs r0, r1 - -; THUMB2-LABEL: t10: -; THUMB2: clz r0, r0 -; THUMB2: lsrs r0, r0, #5 - -; V8MBASE-LABEL: t10: -; V8MBASE-NOT: movs r0, #0 -; V8MBASE: movs r0, #7 } define i1 @t11() { +; ARM-LABEL: t11: +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: .pad #4 +; ARM-NEXT: sub sp, sp, #4 +; ARM-NEXT: ldr r0, .LCPI10_0 +; ARM-NEXT: mov r1, #33 +; ARM-NEXT: umull r2, r3, r1, r0 +; ARM-NEXT: lsr r0, r3, #3 +; ARM-NEXT: add r0, r0, r0, lsl #2 +; ARM-NEXT: sub r0, r1, r0, lsl #1 +; ARM-NEXT: ldr r1, [sp] +; ARM-NEXT: and r1, r1, #-33554432 +; ARM-NEXT: orr r0, r1, r0 +; ARM-NEXT: mov r1, #255 +; ARM-NEXT: orr r0, r0, #40960 +; ARM-NEXT: orr r1, r1, #3840 +; ARM-NEXT: str r0, [sp] +; ARM-NEXT: and r0, r0, r1 +; ARM-NEXT: sub r0, r0, #3 +; ARM-NEXT: rsbs r1, r0, #0 +; ARM-NEXT: adc r0, r0, r1 +; ARM-NEXT: add sp, sp, #4 +; ARM-NEXT: mov pc, lr +; ARM-NEXT: .p2align 2 +; ARM-NEXT: @ %bb.1: +; ARM-NEXT: .LCPI10_0: +; ARM-NEXT: .long 3435973837 @ 0xcccccccd +; +; ARMT2-LABEL: t11: +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: .pad #4 +; ARMT2-NEXT: sub sp, sp, #4 +; ARMT2-NEXT: ldr r1, [sp] +; ARMT2-NEXT: mov r0, #33 +; ARMT2-NEXT: movw r2, #52429 +; ARMT2-NEXT: movt r2, #52428 +; ARMT2-NEXT: bfi r1, r0, #0, #12 +; ARMT2-NEXT: mov r0, #10 +; ARMT2-NEXT: bfi r1, r0, #12, #13 +; ARMT2-NEXT: mov r0, r1 +; ARMT2-NEXT: bfc r0, #12, #20 +; ARMT2-NEXT: umull r2, r3, r0, r2 +; ARMT2-NEXT: lsr r2, r3, #3 +; ARMT2-NEXT: add r2, r2, r2, lsl #2 +; ARMT2-NEXT: sub r0, r0, r2, lsl #1 +; ARMT2-NEXT: movw r2, #40960 +; ARMT2-NEXT: movt r2, #65024 +; ARMT2-NEXT: and r1, r1, r2 +; ARMT2-NEXT: orr r0, r1, r0 +; ARMT2-NEXT: str r0, [sp] +; ARMT2-NEXT: bfc r0, #12, #20 +; ARMT2-NEXT: sub r0, r0, #3 +; ARMT2-NEXT: clz r0, r0 +; ARMT2-NEXT: lsr r0, r0, #5 +; ARMT2-NEXT: add sp, sp, #4 +; ARMT2-NEXT: bx lr +; +; THUMB1-LABEL: t11: +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: .pad #4 +; THUMB1-NEXT: sub sp, #4 +; THUMB1-NEXT: movs r0, #5 +; THUMB1-NEXT: lsls r0, r0, #13 +; THUMB1-NEXT: ldr r1, [sp] +; THUMB1-NEXT: orrs r1, r0 +; THUMB1-NEXT: ldr r0, .LCPI10_0 +; THUMB1-NEXT: ands r0, r1 +; THUMB1-NEXT: adds r0, r0, #3 +; THUMB1-NEXT: str r0, [sp] +; THUMB1-NEXT: movs r1, #0 +; THUMB1-NEXT: rsbs r0, r1, #0 +; THUMB1-NEXT: adcs r0, r1 +; THUMB1-NEXT: add sp, #4 +; THUMB1-NEXT: bx lr +; THUMB1-NEXT: .p2align 2 +; THUMB1-NEXT: @ %bb.1: +; THUMB1-NEXT: .LCPI10_0: +; THUMB1-NEXT: .long 4261453824 @ 0xfe00a000 +; +; THUMB2-LABEL: t11: +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: .pad #4 +; THUMB2-NEXT: sub sp, #4 +; THUMB2-NEXT: ldr r1, [sp] +; THUMB2-NEXT: movs r0, #33 +; THUMB2-NEXT: movw r2, #52429 +; THUMB2-NEXT: bfi r1, r0, #0, #12 +; THUMB2-NEXT: movs r0, #10 +; 
THUMB2-NEXT: bfi r1, r0, #12, #13 +; THUMB2-NEXT: mov r0, r1 +; THUMB2-NEXT: movt r2, #52428 +; THUMB2-NEXT: bfc r0, #12, #20 +; THUMB2-NEXT: umull r2, r3, r0, r2 +; THUMB2-NEXT: lsrs r2, r3, #3 +; THUMB2-NEXT: add.w r2, r2, r2, lsl #2 +; THUMB2-NEXT: sub.w r0, r0, r2, lsl #1 +; THUMB2-NEXT: movw r2, #40960 +; THUMB2-NEXT: movt r2, #65024 +; THUMB2-NEXT: ands r1, r2 +; THUMB2-NEXT: orrs r0, r1 +; THUMB2-NEXT: str r0, [sp] +; THUMB2-NEXT: bfc r0, #12, #20 +; THUMB2-NEXT: subs r0, #3 +; THUMB2-NEXT: clz r0, r0 +; THUMB2-NEXT: lsrs r0, r0, #5 +; THUMB2-NEXT: add sp, #4 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t11: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: .pad #4 +; V8MBASE-NEXT: sub sp, #4 +; V8MBASE-NEXT: movw r0, #40960 +; V8MBASE-NEXT: ldr r1, [sp] +; V8MBASE-NEXT: orrs r1, r0 +; V8MBASE-NEXT: movw r0, #40960 +; V8MBASE-NEXT: movt r0, #65024 +; V8MBASE-NEXT: ands r0, r1 +; V8MBASE-NEXT: adds r0, r0, #3 +; V8MBASE-NEXT: str r0, [sp] +; V8MBASE-NEXT: movs r1, #0 +; V8MBASE-NEXT: rsbs r0, r1, #0 +; V8MBASE-NEXT: adcs r0, r1 +; V8MBASE-NEXT: add sp, #4 +; V8MBASE-NEXT: bx lr entry: %bit = alloca i32 %load = load i32, i32* %bit @@ -329,67 +822,87 @@ entry: %clear12 = and i32 %set10, 4095 %cmp = icmp eq i32 %clear12, 3 ret i1 %cmp - -; ARM-LABEL: t11: -; ARM: rsbs r1, r0, #0 -; ARM: adc r0, r0, r1 - -; ARMT2-LABEL: t11: -; ARMT2: clz r0, r0 -; ARMT2: lsr r0, r0, #5 - -; THUMB1-LABEL: t11: -; THUMB1-NOT: movs r0, #0 -; THUMB1: movs r0, #5 - -; THUMB2-LABEL: t11: -; THUMB2: clz r0, r0 -; THUMB2: lsrs r0, r0, #5 - -; V8MBASE-LABEL: t11: -; V8MBASE-NOT: movs r0, #0 -; V8MBASE: movw r0, #40960 } define i32 @t12(i32 %a) nounwind { -entry: ; ARM-LABEL: t12: -; ARM-NOT: mov -; ARM: cmp r0, #0 -; ARM: movne r0, #1 - +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: cmp r0, #0 +; ARM-NEXT: movne r0, #1 +; ARM-NEXT: lsl r0, r0, #1 +; ARM-NEXT: mov pc, lr +; +; ARMT2-LABEL: t12: +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: cmp r0, #0 +; ARMT2-NEXT: movwne r0, #1 +; ARMT2-NEXT: lsl r0, r0, #1 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t12: -; THUMB1: subs r1, r0, #1 -; THUMB1: sbcs r0, r1 -; THUMB1: lsls r0, r0, #1 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: subs r1, r0, #1 +; THUMB1-NEXT: sbcs r0, r1 +; THUMB1-NEXT: lsls r0, r0, #1 +; THUMB1-NEXT: bx lr +; ; THUMB2-LABEL: t12: -; THUMB2-NOT: mov -; THUMB2: cmp r0, #0 -; THUMB2: it ne -; THUMB2: movne r0, #1 +; THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: cmp r0, #0 +; THUMB2-NEXT: it ne +; THUMB2-NEXT: movne r0, #1 +; THUMB2-NEXT: lsls r0, r0, #1 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t12: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: subs r1, r0, #1 +; V8MBASE-NEXT: sbcs r0, r1 +; V8MBASE-NEXT: lsls r0, r0, #1 +; V8MBASE-NEXT: bx lr +entry: %tobool = icmp ne i32 %a, 0 %lnot.ext = select i1 %tobool, i32 2, i32 0 ret i32 %lnot.ext } define i32 @t13(i32 %a) nounwind { -entry: ; ARM-LABEL: t13: -; ARM-NOT: mov -; ARM: cmp r0, #0 -; ARM: movne r0, #3 - +; ARM: @ %bb.0: @ %entry +; ARM-NEXT: cmp r0, #0 +; ARM-NEXT: movne r0, #3 +; ARM-NEXT: mov pc, lr +; +; ARMT2-LABEL: t13: +; ARMT2: @ %bb.0: @ %entry +; ARMT2-NEXT: cmp r0, #0 +; ARMT2-NEXT: movwne r0, #3 +; ARMT2-NEXT: bx lr +; ; THUMB1-LABEL: t13: -; THUMB1: cmp r0, #0 -; THUMB1: beq -; THUMB1: movs r0, #3 - +; THUMB1: @ %bb.0: @ %entry +; THUMB1-NEXT: cmp r0, #0 +; THUMB1-NEXT: beq .LBB12_2 +; THUMB1-NEXT: @ %bb.1: +; THUMB1-NEXT: movs r0, #3 +; THUMB1-NEXT: .LBB12_2: @ %entry +; THUMB1-NEXT: bx lr +; ; THUMB2-LABEL: t13: -; THUMB2-NOT: mov -; THUMB2: cmp r0, #0 -; THUMB2: it ne -; THUMB2: movne r0, #3 +; 
THUMB2: @ %bb.0: @ %entry +; THUMB2-NEXT: cmp r0, #0 +; THUMB2-NEXT: it ne +; THUMB2-NEXT: movne r0, #3 +; THUMB2-NEXT: bx lr +; +; V8MBASE-LABEL: t13: +; V8MBASE: @ %bb.0: @ %entry +; V8MBASE-NEXT: cbz r0, .LBB12_2 +; V8MBASE-NEXT: @ %bb.1: +; V8MBASE-NEXT: movs r0, #3 +; V8MBASE-NEXT: .LBB12_2: @ %entry +; V8MBASE-NEXT: bx lr +entry: %tobool = icmp ne i32 %a, 0 %lnot.ext = select i1 %tobool, i32 3, i32 0 ret i32 %lnot.ext -- GitLab From 388fbefb4f2aec19da221ebbc3e091919f7520a9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Mar 2021 11:15:44 +0000 Subject: [PATCH 0020/1000] [AMDGPU] Regenerate atomic_optimizations_global_pointer.ll tests --- .../atomic_optimizations_global_pointer.ll | 2241 ++++++++++++++++- 1 file changed, 2106 insertions(+), 135 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index a1280e1f9791..aba4f7d80aa9 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,65 +1,594 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN64 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN32 %s declare i32 @llvm.amdgcn.workitem.id.x() ; 
Show what the atomic optimization pass will do for global pointers. -; GCN-LABEL: add_i32_constant: -; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] -; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} -; GCN: s_mul_i32 s[[value:[0-9]+]], s[[popcount]], 5 -; GCN: v_mov_b32_e32 v[[data:[0-9]+]], s[[value]] -; GCN: {{flat|buffer|global}}_atomic_add v[[data]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { +; GFX7LESS-LABEL: add_i32_constant: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz BB0_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s9, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: BB0_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX89-LABEL: add_i32_constant: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b64 s[6:7], exec +; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX89-NEXT: ; implicit-def: $vgpr1 +; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX89-NEXT: s_cbranch_execz BB0_2 +; GFX89-NEXT: ; %bb.1: +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 +; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX89-NEXT: s_mul_i32 s2, s2, 5 +; GFX89-NEXT: s_mov_b32 s11, 0xf000 +; GFX89-NEXT: s_mov_b32 s10, -1 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: v_mov_b32_e32 v1, s2 +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_wbinvl1_vol +; GFX89-NEXT: BB0_2: +; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX89-NEXT: v_readfirstlane_b32 s4, v1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX89-NEXT: s_endpgm +; +; GCN64-LABEL: add_i32_constant: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
+; GCN64-NEXT: s_mov_b64 s[6:7], exec +; GCN64-NEXT: ; implicit-def: $vgpr1 +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN64-NEXT: s_cbranch_execz BB0_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GCN64-NEXT: s_mov_b32 s11, 0x31016000 +; GCN64-NEXT: s_mul_i32 s6, s6, 5 +; GCN64-NEXT: s_mov_b32 s10, -1 +; GCN64-NEXT: v_mov_b32_e32 v1, s6 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mov_b32 s8, s2 +; GCN64-NEXT: s_mov_b32 s9, s3 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB0_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_readfirstlane_b32 s2, v1 +; GCN64-NEXT: s_mov_b32 s3, 0x31016000 +; GCN64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GCN64-NEXT: s_mov_b32 s2, -1 +; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: add_i32_constant: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN32-NEXT: s_mov_b32 s5, exec_lo +; GCN32-NEXT: ; implicit-def: $vgpr1 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB0_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 +; GCN32-NEXT: s_mov_b32 s11, 0x31016000 +; GCN32-NEXT: s_mul_i32 s5, s5, 5 +; GCN32-NEXT: s_mov_b32 s10, -1 +; GCN32-NEXT: v_mov_b32_e32 v1, s5 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mov_b32 s8, s2 +; GCN32-NEXT: s_mov_b32 s9, s3 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB0_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: v_readfirstlane_b32 s2, v1 +; GCN32-NEXT: s_mov_b32 s3, 0x31016000 +; GCN32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GCN32-NEXT: s_mov_b32 s2, -1 +; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN32-NEXT: s_endpgm entry: %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel store i32 %old, i32 addrspace(1)* %out ret void } -; GCN-LABEL: add_i32_uniform: -; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] -; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} -; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] -; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] -; GCN: {{flat|buffer|global}}_atomic_add v[[value]] define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %additive) { +; GFX7LESS-LABEL: add_i32_uniform: 
+; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX7LESS-NEXT: s_cbranch_execz BB1_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: BB1_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: add_i32_uniform: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_cbranch_execz BB1_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mov_b32 s15, 0xf000 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s12, s6 +; GFX8-NEXT: s_mov_b32 s13, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: BB1_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_i32_uniform: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz BB1_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_atomic_add v1, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; 
GFX9-NEXT: BB1_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GCN64-LABEL: add_i32_uniform: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: s_clause 0x1 +; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN64-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN64-NEXT: s_mov_b64 s[8:9], exec +; GCN64-NEXT: ; implicit-def: $vgpr1 +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN64-NEXT: s_cbranch_execz BB1_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GCN64-NEXT: s_mov_b32 s11, 0x31016000 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mul_i32 s3, s2, s3 +; GCN64-NEXT: s_mov_b32 s10, -1 +; GCN64-NEXT: v_mov_b32_e32 v1, s3 +; GCN64-NEXT: s_mov_b32 s8, s6 +; GCN64-NEXT: s_mov_b32 s9, s7 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB1_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN64-NEXT: v_readfirstlane_b32 s0, v1 +; GCN64-NEXT: s_mov_b32 s7, 0x31016000 +; GCN64-NEXT: s_mov_b32 s6, -1 +; GCN64-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GCN64-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: add_i32_uniform: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: s_clause 0x1 +; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN32-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN32-NEXT: s_mov_b32 s3, exec_lo +; GCN32-NEXT: ; implicit-def: $vgpr1 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB1_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: s_bcnt1_i32_b32 s1, s3 +; GCN32-NEXT: s_mov_b32 s11, 0x31016000 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mul_i32 s1, s2, s1 +; GCN32-NEXT: s_mov_b32 s10, -1 +; GCN32-NEXT: v_mov_b32_e32 v1, s1 +; GCN32-NEXT: s_mov_b32 s8, s6 +; GCN32-NEXT: s_mov_b32 s9, s7 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB1_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN32-NEXT: v_readfirstlane_b32 s0, v1 +; GCN32-NEXT: s_mov_b32 s7, 0x31016000 +; GCN32-NEXT: s_mov_b32 s6, -1 +; GCN32-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GCN32-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN32-NEXT: s_endpgm entry: %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel store i32 %old, i32 addrspace(1)* %out ret void } -; GCN-LABEL: add_i32_varying: -; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 -; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 -; GFX7LESS-NOT: s_bcnt1_i32_b64 -; GFX7LESS: buffer_atomic_add v{{[0-9]+}} -; DPPCOMB: v_add_u32_dpp -; DPPCOMB: 
v_add_u32_dpp -; GFX8MORE32: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 -; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] -; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] -; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] -; GFX8MORE: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { +; GFX7LESS-LABEL: add_i32_varying: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: s_mov_b32 s10, s6 +; GFX7LESS-NEXT: s_mov_b32 s11, s7 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s9, s3 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_mov_b32 s4, s0 +; GFX7LESS-NEXT: s_mov_b32 s5, s1 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: add_i32_varying: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz BB2_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: BB2_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_i32_varying: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz BB2_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: BB2_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GCN64-LABEL: add_i32_varying: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: v_mov_b32_e32 v1, v0 +; GCN64-NEXT: s_not_b64 exec, exec +; GCN64-NEXT: v_mov_b32_e32 v1, 0 +; GCN64-NEXT: s_not_b64 exec, exec +; GCN64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN64-NEXT: v_mov_b32_e32 v3, 0 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN64-NEXT: v_mov_b32_e32 v2, v1 +; GCN64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GCN64-NEXT: v_readlane_b32 s4, v1, 31 +; GCN64-NEXT: v_mov_b32_e32 v2, s4 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GCN64-NEXT: v_readlane_b32 s6, v1, 15 +; GCN64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN64-NEXT: s_mov_b64 exec, s[2:3] +; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN64-NEXT: v_readlane_b32 s7, v1, 31 +; GCN64-NEXT: v_writelane_b32 v3, s6, 16 +; GCN64-NEXT: s_mov_b64 exec, s[4:5] +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN64-NEXT: s_or_saveexec_b64 
s[4:5], -1 +; GCN64-NEXT: v_readlane_b32 s8, v1, 47 +; GCN64-NEXT: v_readlane_b32 s9, v1, 63 +; GCN64-NEXT: v_writelane_b32 v3, s7, 32 +; GCN64-NEXT: s_mov_b64 exec, s[4:5] +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GCN64-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN64-NEXT: s_mov_b32 s4, s9 +; GCN64-NEXT: v_writelane_b32 v3, s8, 48 +; GCN64-NEXT: s_mov_b64 exec, s[6:7] +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_mov_b32 s6, -1 +; GCN64-NEXT: ; implicit-def: $vgpr0 +; GCN64-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN64-NEXT: s_cbranch_execz BB2_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: v_mov_b32_e32 v0, s4 +; GCN64-NEXT: s_mov_b32 s7, 0x31016000 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mov_b32 s4, s2 +; GCN64-NEXT: s_mov_b32 s5, s3 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB2_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_readfirstlane_b32 s2, v0 +; GCN64-NEXT: v_mov_b32_e32 v0, v3 +; GCN64-NEXT: s_mov_b32 s3, 0x31016000 +; GCN64-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GCN64-NEXT: s_mov_b32 s2, s6 +; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: add_i32_varying: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: v_mov_b32_e32 v1, v0 +; GCN32-NEXT: s_not_b32 exec_lo, exec_lo +; GCN32-NEXT: v_mov_b32_e32 v1, 0 +; GCN32-NEXT: s_not_b32 exec_lo, exec_lo +; GCN32-NEXT: s_or_saveexec_b32 s2, -1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN32-NEXT: v_mov_b32_e32 v2, v1 +; GCN32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GCN32-NEXT: s_mov_b32 exec_lo, s2 +; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN32-NEXT: s_or_saveexec_b32 s4, -1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GCN32-NEXT: v_mov_b32_e32 v3, 0 +; GCN32-NEXT: v_readlane_b32 s5, v1, 15 +; GCN32-NEXT: v_readlane_b32 s6, v1, 31 +; GCN32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN32-NEXT: s_mov_b32 exec_lo, s4 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN32-NEXT: s_or_saveexec_b32 s4, -1 +; GCN32-NEXT: v_writelane_b32 v3, s5, 16 +; GCN32-NEXT: s_mov_b32 exec_lo, s4 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_mov_b32 s4, s6 +; GCN32-NEXT: s_mov_b32 s6, -1 +; GCN32-NEXT: ; implicit-def: $vgpr0 +; GCN32-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB2_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: v_mov_b32_e32 v0, s4 +; GCN32-NEXT: s_mov_b32 s7, 0x31016000 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mov_b32 s4, s2 +; GCN32-NEXT: s_mov_b32 s5, s3 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB2_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GCN32-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN32-NEXT: v_readfirstlane_b32 s2, v0 +; GCN32-NEXT: v_mov_b32_e32 v0, v3 +; GCN32-NEXT: s_mov_b32 s3, 0x31016000 +; GCN32-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GCN32-NEXT: s_mov_b32 s2, s6 +; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel @@ -67,49 +596,464 @@ entry: ret void } -; GCN-LABEL: add_i64_constant: -; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] -; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} -; GCN-DAG: s_mul_i32 s[[value:[0-9]+]], s[[popcount]], 5 -; GCN-DAG: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 -; GCN: v_mov_b32_e32 v[[value_lo:[0-9]+]], s[[value]] -; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { +; GFX7LESS-LABEL: add_i64_constant: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz BB3_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s9, s3 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s3, s2, 5 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: BB3_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX89-LABEL: add_i64_constant: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b64 s[6:7], exec +; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX89-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX89-NEXT: s_cbranch_execz BB3_2 +; GFX89-NEXT: ; %bb.1: +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 
s8, s2 +; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX89-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 +; GFX89-NEXT: s_mul_i32 s2, s2, 5 +; GFX89-NEXT: s_mov_b32 s11, 0xf000 +; GFX89-NEXT: s_mov_b32 s10, -1 +; GFX89-NEXT: s_mov_b32 s9, s3 +; GFX89-NEXT: v_mov_b32_e32 v1, s2 +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX89-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_wbinvl1_vol +; GFX89-NEXT: BB3_2: +; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_readfirstlane_b32 s2, v1 +; GFX89-NEXT: v_readfirstlane_b32 s3, v2 +; GFX89-NEXT: v_mov_b32_e32 v1, s2 +; GFX89-NEXT: v_mov_b32_e32 v2, s3 +; GFX89-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: s_nop 2 +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX89-NEXT: s_endpgm +; +; GCN64-LABEL: add_i64_constant: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN64-NEXT: s_mov_b64 s[6:7], exec +; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN64-NEXT: s_cbranch_execz BB3_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GCN64-NEXT: s_mov_b32 s11, 0x31016000 +; GCN64-NEXT: s_mul_i32 s7, s6, 5 +; GCN64-NEXT: v_mul_hi_u32_u24_e64 v2, s6, 5 +; GCN64-NEXT: v_mov_b32_e32 v1, s7 +; GCN64-NEXT: s_mov_b32 s10, -1 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mov_b32 s8, s2 +; GCN64-NEXT: s_mov_b32 s9, s3 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB3_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_readfirstlane_b32 s2, v1 +; GCN64-NEXT: v_readfirstlane_b32 s3, v2 +; GCN64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] +; GCN64-NEXT: s_mov_b32 s3, 0x31016000 +; GCN64-NEXT: s_mov_b32 s2, -1 +; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: add_i64_constant: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN32-NEXT: s_mov_b32 s5, exec_lo +; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB3_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 +; GCN32-NEXT: s_mov_b32 s11, 0x31016000 +; GCN32-NEXT: s_mul_i32 s6, s5, 5 +; GCN32-NEXT: v_mul_hi_u32_u24_e64 v2, s5, 5 +; GCN32-NEXT: v_mov_b32_e32 v1, s6 +; GCN32-NEXT: s_mov_b32 s10, -1 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mov_b32 s8, s2 +; GCN32-NEXT: s_mov_b32 s9, s3 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB3_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: v_readfirstlane_b32 s2, v1 +; 
GCN32-NEXT: v_readfirstlane_b32 s3, v2 +; GCN32-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] +; GCN32-NEXT: s_mov_b32 s3, 0x31016000 +; GCN32-NEXT: s_mov_b32 s2, -1 +; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN32-NEXT: s_endpgm entry: %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel store i64 %old, i64 addrspace(1)* %out ret void } -; GCN-LABEL: add_i64_uniform: -; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] -; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} -; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { +; GFX7LESS-LABEL: add_i64_uniform: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz BB4_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: BB4_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX7LESS-NEXT: v_mul_hi_u32 v2, s0, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 +; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: add_i64_uniform: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz BB4_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s12, s6 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX8-NEXT: s_mov_b32 s13, s7 +; GFX8-NEXT: s_mul_i32 s7, s1, s6 +; GFX8-NEXT: s_mul_i32 s6, s0, s6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 +; GFX8-NEXT: s_mov_b32 s15, 0xf000 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: BB4_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_i64_uniform: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz BB4_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: BB4_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GCN64-LABEL: add_i64_uniform: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: s_clause 0x1 +; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN64-NEXT: s_mov_b64 s[8:9], exec +; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN64-NEXT: s_cbranch_execz BB4_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: s_bcnt1_i32_b64 s8, s[8:9] +; GCN64-NEXT: s_mov_b32 s11, 0x31016000 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mul_i32 s9, s3, 
s8 +; GCN64-NEXT: s_mul_hi_u32 s10, s2, s8 +; GCN64-NEXT: s_mul_i32 s8, s2, s8 +; GCN64-NEXT: s_add_i32 s10, s10, s9 +; GCN64-NEXT: v_mov_b32_e32 v1, s8 +; GCN64-NEXT: v_mov_b32_e32 v2, s10 +; GCN64-NEXT: s_mov_b32 s10, -1 +; GCN64-NEXT: s_mov_b32 s8, s6 +; GCN64-NEXT: s_mov_b32 s9, s7 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB4_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN64-NEXT: v_mul_hi_u32 v4, s2, v0 +; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN64-NEXT: v_readfirstlane_b32 s0, v1 +; GCN64-NEXT: v_readfirstlane_b32 s1, v2 +; GCN64-NEXT: s_mov_b32 s7, 0x31016000 +; GCN64-NEXT: s_mov_b32 s6, -1 +; GCN64-NEXT: v_add_nc_u32_e32 v1, v4, v3 +; GCN64-NEXT: v_add_co_u32_e64 v0, vcc, s0, v0 +; GCN64-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc +; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: add_i64_uniform: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: s_clause 0x1 +; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN32-NEXT: s_mov_b32 s8, exec_lo +; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB4_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: s_bcnt1_i32_b32 s1, s8 +; GCN32-NEXT: s_mov_b32 s11, 0x31016000 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mul_i32 s8, s3, s1 +; GCN32-NEXT: s_mul_hi_u32 s9, s2, s1 +; GCN32-NEXT: s_mul_i32 s1, s2, s1 +; GCN32-NEXT: s_add_i32 s9, s9, s8 +; GCN32-NEXT: v_mov_b32_e32 v1, s1 +; GCN32-NEXT: v_mov_b32_e32 v2, s9 +; GCN32-NEXT: s_mov_b32 s10, -1 +; GCN32-NEXT: s_mov_b32 s8, s6 +; GCN32-NEXT: s_mov_b32 s9, s7 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB4_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN32-NEXT: v_mul_hi_u32 v4, s2, v0 +; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN32-NEXT: v_readfirstlane_b32 s0, v1 +; GCN32-NEXT: v_readfirstlane_b32 s1, v2 +; GCN32-NEXT: s_mov_b32 s7, 0x31016000 +; GCN32-NEXT: s_mov_b32 s6, -1 +; GCN32-NEXT: v_add_nc_u32_e32 v1, v4, v3 +; GCN32-NEXT: v_add_co_u32_e64 v0, vcc_lo, s0, v0 +; GCN32-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN32-NEXT: s_endpgm entry: %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel store i64 %old, i64 addrspace(1)* %out ret void } -; GCN-LABEL: add_i64_varying: -; GCN-NOT: v_mbcnt_lo_u32_b32 -; GCN-NOT: v_mbcnt_hi_u32_b32 -; GCN-NOT: s_bcnt1_i32_b64 -; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { +; GFX7LESS-LABEL: add_i64_varying: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 
0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: s_mov_b32 s10, s6 +; GFX7LESS-NEXT: s_mov_b32 s11, s7 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s9, s3 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_mov_b32 s4, s0 +; GFX7LESS-NEXT: s_mov_b32 s5, s1 +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX89-LABEL: add_i64_varying: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: v_mov_b32_e32 v1, 0 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 +; GFX89-NEXT: s_mov_b32 s4, s6 +; GFX89-NEXT: s_mov_b32 s5, s7 +; GFX89-NEXT: s_mov_b32 s6, s2 +; GFX89-NEXT: s_mov_b32 s7, s3 +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_wbinvl1_vol +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX89-NEXT: s_endpgm +; +; GFX10-LABEL: add_i64_varying: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -118,58 +1062,624 @@ entry: ret void } -; GCN-LABEL: sub_i32_constant: -; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] -; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} -; GCN: s_mul_i32 s[[value:[0-9]+]], s[[popcount]], 5 -; GCN: v_mov_b32_e32 v[[data:[0-9]+]], s[[value]] -; GCN: {{flat|buffer|global}}_atomic_sub v[[data]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { +; GFX7LESS-LABEL: sub_i32_constant: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz BB6_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 
s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s9, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: BB6_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: sub_i32_constant: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz BB6_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: BB6_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sub_i32_constant: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz BB6_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: BB6_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GCN64-LABEL: sub_i32_constant: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN64-NEXT: s_mov_b64 s[6:7], exec +; GCN64-NEXT: ; implicit-def: $vgpr1 +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; 
GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN64-NEXT: s_cbranch_execz BB6_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GCN64-NEXT: s_mov_b32 s11, 0x31016000 +; GCN64-NEXT: s_mul_i32 s6, s6, 5 +; GCN64-NEXT: s_mov_b32 s10, -1 +; GCN64-NEXT: v_mov_b32_e32 v1, s6 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mov_b32 s8, s2 +; GCN64-NEXT: s_mov_b32 s9, s3 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB6_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_readfirstlane_b32 s2, v1 +; GCN64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GCN64-NEXT: s_mov_b32 s3, 0x31016000 +; GCN64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GCN64-NEXT: s_mov_b32 s2, -1 +; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: sub_i32_constant: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN32-NEXT: s_mov_b32 s5, exec_lo +; GCN32-NEXT: ; implicit-def: $vgpr1 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB6_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 +; GCN32-NEXT: s_mov_b32 s11, 0x31016000 +; GCN32-NEXT: s_mul_i32 s5, s5, 5 +; GCN32-NEXT: s_mov_b32 s10, -1 +; GCN32-NEXT: v_mov_b32_e32 v1, s5 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mov_b32 s8, s2 +; GCN32-NEXT: s_mov_b32 s9, s3 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB6_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: v_readfirstlane_b32 s2, v1 +; GCN32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GCN32-NEXT: s_mov_b32 s3, 0x31016000 +; GCN32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GCN32-NEXT: s_mov_b32 s2, -1 +; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN32-NEXT: s_endpgm entry: %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel store i32 %old, i32 addrspace(1)* %out ret void } -; GCN-LABEL: sub_i32_uniform: -; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] -; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} -; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] -; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] -; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace(1)* %inout, i32 %subitive) { +; GFX7LESS-LABEL: sub_i32_uniform: +; GFX7LESS: ; %bb.0: ; %entry +; 
GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dword s0, s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX7LESS-NEXT: s_cbranch_execz BB7_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mul_i32 s1, s0, s1 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: BB7_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: sub_i32_uniform: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34 +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr1 +; GFX8-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX8-NEXT: s_cbranch_execz BB7_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mul_i32 s1, s0, s1 +; GFX8-NEXT: s_mov_b32 s15, 0xf000 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: s_mov_b32 s12, s6 +; GFX8-NEXT: s_mov_b32 s13, s7 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: BB7_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s0, v1 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sub_i32_uniform: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz BB7_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mul_i32 s3, s2, s3 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_atomic_sub v1, off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: BB7_2: +; GFX9-NEXT: 
s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_sub_u32_e32 v0, s0, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GCN64-LABEL: sub_i32_uniform: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: s_clause 0x1 +; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN64-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN64-NEXT: s_mov_b64 s[8:9], exec +; GCN64-NEXT: ; implicit-def: $vgpr1 +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN64-NEXT: s_cbranch_execz BB7_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GCN64-NEXT: s_mov_b32 s11, 0x31016000 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mul_i32 s3, s2, s3 +; GCN64-NEXT: s_mov_b32 s10, -1 +; GCN64-NEXT: v_mov_b32_e32 v1, s3 +; GCN64-NEXT: s_mov_b32 s8, s6 +; GCN64-NEXT: s_mov_b32 s9, s7 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB7_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN64-NEXT: v_readfirstlane_b32 s0, v1 +; GCN64-NEXT: s_mov_b32 s7, 0x31016000 +; GCN64-NEXT: s_mov_b32 s6, -1 +; GCN64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GCN64-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: sub_i32_uniform: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: s_clause 0x1 +; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN32-NEXT: s_load_dword s2, s[0:1], 0x34 +; GCN32-NEXT: s_mov_b32 s3, exec_lo +; GCN32-NEXT: ; implicit-def: $vgpr1 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB7_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: s_bcnt1_i32_b32 s1, s3 +; GCN32-NEXT: s_mov_b32 s11, 0x31016000 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mul_i32 s1, s2, s1 +; GCN32-NEXT: s_mov_b32 s10, -1 +; GCN32-NEXT: v_mov_b32_e32 v1, s1 +; GCN32-NEXT: s_mov_b32 s8, s6 +; GCN32-NEXT: s_mov_b32 s9, s7 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB7_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN32-NEXT: v_readfirstlane_b32 s0, v1 +; GCN32-NEXT: s_mov_b32 s7, 0x31016000 +; GCN32-NEXT: s_mov_b32 s6, -1 +; GCN32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GCN32-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN32-NEXT: s_endpgm entry: %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel store i32 %old, i32 addrspace(1)* %out ret void } -; GCN-LABEL: sub_i32_varying: -; GFX7LESS-NOT: v_mbcnt_lo_u32_b32 -; GFX7LESS-NOT: v_mbcnt_hi_u32_b32 -; GFX7LESS-NOT: s_bcnt1_i32_b64 -; GFX7LESS: buffer_atomic_sub v{{[0-9]+}} -; DPPCOMB: v_add_u32_dpp -; DPPCOMB: v_add_u32_dpp -; GFX8MORE32: 
v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 31 -; GFX8MORE64: v_readlane_b32 s[[scalar_value:[0-9]+]], v{{[0-9]+}}, 63 -; GFX89: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] -; GFX10: s_mov_b32 s[[copy_value:[0-9]+]], s[[scalar_value]] -; GFX10: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[copy_value]] -; GFX8MORE: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { +; GFX7LESS-LABEL: sub_i32_varying: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: s_mov_b32 s10, s6 +; GFX7LESS-NEXT: s_mov_b32 s11, s7 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s9, s3 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_mov_b32 s4, s0 +; GFX7LESS-NEXT: s_mov_b32 s5, s1 +; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: sub_i32_varying: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s6, v2, 63 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz BB8_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: BB8_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s4, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sub_i32_varying: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 
v2, v0 +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s6, v2, 63 +; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz BB8_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: BB8_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GCN64-LABEL: sub_i32_varying: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: v_mov_b32_e32 v1, v0 +; GCN64-NEXT: s_not_b64 exec, exec +; GCN64-NEXT: v_mov_b32_e32 v1, 0 +; GCN64-NEXT: s_not_b64 exec, exec +; GCN64-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN64-NEXT: v_mov_b32_e32 v3, 0 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN64-NEXT: v_mov_b32_e32 v2, v1 +; GCN64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GCN64-NEXT: v_readlane_b32 s4, v1, 31 +; GCN64-NEXT: v_mov_b32_e32 v2, s4 +; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GCN64-NEXT: v_readlane_b32 s6, v1, 15 +; GCN64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN64-NEXT: s_mov_b64 exec, s[2:3] +; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN64-NEXT: v_readlane_b32 s7, v1, 31 +; GCN64-NEXT: v_writelane_b32 v3, s6, 16 +; GCN64-NEXT: s_mov_b64 exec, s[4:5] +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN64-NEXT: 
v_readlane_b32 s8, v1, 47 +; GCN64-NEXT: v_readlane_b32 s9, v1, 63 +; GCN64-NEXT: v_writelane_b32 v3, s7, 32 +; GCN64-NEXT: s_mov_b64 exec, s[4:5] +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GCN64-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN64-NEXT: s_mov_b32 s4, s9 +; GCN64-NEXT: v_writelane_b32 v3, s8, 48 +; GCN64-NEXT: s_mov_b64 exec, s[6:7] +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_mov_b32 s6, -1 +; GCN64-NEXT: ; implicit-def: $vgpr0 +; GCN64-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GCN64-NEXT: s_cbranch_execz BB8_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: v_mov_b32_e32 v0, s4 +; GCN64-NEXT: s_mov_b32 s7, 0x31016000 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mov_b32 s4, s2 +; GCN64-NEXT: s_mov_b32 s5, s3 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB8_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_readfirstlane_b32 s2, v0 +; GCN64-NEXT: v_mov_b32_e32 v0, v3 +; GCN64-NEXT: s_mov_b32 s3, 0x31016000 +; GCN64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GCN64-NEXT: s_mov_b32 s2, s6 +; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: sub_i32_varying: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: v_mov_b32_e32 v1, v0 +; GCN32-NEXT: s_not_b32 exec_lo, exec_lo +; GCN32-NEXT: v_mov_b32_e32 v1, 0 +; GCN32-NEXT: s_not_b32 exec_lo, exec_lo +; GCN32-NEXT: s_or_saveexec_b32 s2, -1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GCN32-NEXT: v_mov_b32_e32 v2, v1 +; GCN32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GCN32-NEXT: s_mov_b32 exec_lo, s2 +; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN32-NEXT: s_or_saveexec_b32 s4, -1 +; GCN32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GCN32-NEXT: v_mov_b32_e32 v3, 0 +; GCN32-NEXT: v_readlane_b32 s5, v1, 15 +; GCN32-NEXT: v_readlane_b32 s6, v1, 31 +; GCN32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GCN32-NEXT: s_mov_b32 exec_lo, s4 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN32-NEXT: s_or_saveexec_b32 s4, -1 +; GCN32-NEXT: v_writelane_b32 v3, s5, 16 +; GCN32-NEXT: s_mov_b32 exec_lo, s4 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_mov_b32 s4, s6 +; GCN32-NEXT: s_mov_b32 s6, -1 +; GCN32-NEXT: ; implicit-def: $vgpr0 +; GCN32-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB8_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: v_mov_b32_e32 v0, s4 +; GCN32-NEXT: s_mov_b32 s7, 0x31016000 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mov_b32 s4, s2 +; GCN32-NEXT: s_mov_b32 s5, s3 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB8_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: 
v_readfirstlane_b32 s2, v0 +; GCN32-NEXT: v_mov_b32_e32 v0, v3 +; GCN32-NEXT: s_mov_b32 s3, 0x31016000 +; GCN32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GCN32-NEXT: s_mov_b32 s2, s6 +; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN32-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel @@ -177,49 +1687,510 @@ entry: ret void } -; GCN-LABEL: sub_i64_constant: -; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] -; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} -; GCN-DAG: s_mul_i32 s[[value:[0-9]+]], s[[popcount]], 5 -; GCN-DAG: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 -; GCN: v_mov_b32_e32 v[[value_lo:[0-9]+]], s[[value]] -; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { +; GFX7LESS-LABEL: sub_i64_constant: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz BB9_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s9, s3 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s3, s2, 5 +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: BB9_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: sub_i64_constant: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz BB9_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 +; GFX8-NEXT: s_bcnt1_i32_b64 
s2, s[6:7] +; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s9, s3 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: BB9_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s5, v2 +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s5 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sub_i64_constant: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s7, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz BB9_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s9, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: BB9_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_readfirstlane_b32 s5, v2 +; GFX9-NEXT: v_readfirstlane_b32 s4, v1 +; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s5 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GCN64-LABEL: sub_i64_constant: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN64-NEXT: s_mov_b64 s[6:7], exec +; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GCN64-NEXT: s_cbranch_execz BB9_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GCN64-NEXT: s_mov_b32 s11, 0x31016000 +; GCN64-NEXT: s_mul_i32 s7, s6, 5 +; GCN64-NEXT: v_mul_hi_u32_u24_e64 v2, s6, 5 +; GCN64-NEXT: v_mov_b32_e32 v1, s7 +; GCN64-NEXT: s_mov_b32 s10, -1 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mov_b32 s8, s2 +; GCN64-NEXT: s_mov_b32 s9, s3 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB9_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] +; GCN64-NEXT: 
s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_readfirstlane_b32 s2, v1 +; GCN64-NEXT: v_mul_u32_u24_e32 v1, 5, v0 +; GCN64-NEXT: v_readfirstlane_b32 s3, v2 +; GCN64-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 +; GCN64-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 +; GCN64-NEXT: s_mov_b32 s2, -1 +; GCN64-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GCN64-NEXT: s_mov_b32 s3, 0x31016000 +; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: sub_i64_constant: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN32-NEXT: s_mov_b32 s5, exec_lo +; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB9_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 +; GCN32-NEXT: s_mov_b32 s11, 0x31016000 +; GCN32-NEXT: s_mul_i32 s6, s5, 5 +; GCN32-NEXT: v_mul_hi_u32_u24_e64 v2, s5, 5 +; GCN32-NEXT: v_mov_b32_e32 v1, s6 +; GCN32-NEXT: s_mov_b32 s10, -1 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: s_mov_b32 s8, s2 +; GCN32-NEXT: s_mov_b32 s9, s3 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB9_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: v_readfirstlane_b32 s2, v1 +; GCN32-NEXT: v_mul_u32_u24_e32 v1, 5, v0 +; GCN32-NEXT: v_readfirstlane_b32 s3, v2 +; GCN32-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 +; GCN32-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 +; GCN32-NEXT: s_mov_b32 s2, -1 +; GCN32-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GCN32-NEXT: s_mov_b32 s3, 0x31016000 +; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN32-NEXT: s_endpgm entry: %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel store i64 %old, i64 addrspace(1)* %out ret void } -; GCN-LABEL: sub_i64_uniform: -; GCN32: s_mov_b32 s[[exec_lo:[0-9]+]], exec_lo -; GCN64: s_mov_b64 s{{\[}}[[exec_lo:[0-9]+]]:[[exec_hi:[0-9]+]]{{\]}}, exec -; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], s[[exec_lo]], 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], s[[exec_hi]], v[[mbcnt]] -; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] -; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s[[exec_lo]] -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s{{\[}}[[exec_lo]]:[[exec_hi]]{{\]}} -; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { +; GFX7LESS-LABEL: sub_i64_uniform: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec +; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s9, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz BB10_2 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s14, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX7LESS-NEXT: s_mov_b32 s12, s6 +; GFX7LESS-NEXT: s_mov_b32 s13, s7 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 +; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: BB10_2: +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX7LESS-NEXT: v_mul_hi_u32 v2, s0, v0 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v2, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s3 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 +; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: sub_i64_uniform: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz BB10_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s12, s6 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mul_hi_u32 v1, s0, v1 +; GFX8-NEXT: s_mov_b32 s13, s7 +; GFX8-NEXT: s_mul_i32 s7, s1, s6 +; GFX8-NEXT: s_mul_i32 s6, s0, s6 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 +; GFX8-NEXT: s_mov_b32 s15, 0xf000 +; GFX8-NEXT: s_mov_b32 s14, -1 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: BB10_2: +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 +; GFX8-NEXT: v_mul_hi_u32 v3, s0, v0 +; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 +; GFX8-NEXT: v_readfirstlane_b32 s1, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: s_mov_b32 s7, 0xf000 +; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sub_i64_uniform: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX9-NEXT: s_cbranch_execz BB10_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; 
GFX9-NEXT: s_mov_b32 s13, s7 +; GFX9-NEXT: s_mul_i32 s7, s3, s6 +; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 +; GFX9-NEXT: s_add_i32 s8, s8, s7 +; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: s_mov_b32 s15, 0xf000 +; GFX9-NEXT: s_mov_b32 s14, -1 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: BB10_2: +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 +; GFX9-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s0, v1 +; GFX9-NEXT: v_readfirstlane_b32 s1, v2 +; GFX9-NEXT: v_add_u32_e32 v1, v4, v3 +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: s_mov_b32 s7, 0xf000 +; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_endpgm +; +; GCN64-LABEL: sub_i64_uniform: +; GCN64: ; %bb.0: ; %entry +; GCN64-NEXT: s_clause 0x1 +; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN64-NEXT: s_mov_b64 s[8:9], exec +; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 +; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GCN64-NEXT: s_cbranch_execz BB10_2 +; GCN64-NEXT: ; %bb.1: +; GCN64-NEXT: s_bcnt1_i32_b64 s8, s[8:9] +; GCN64-NEXT: s_mov_b32 s11, 0x31016000 +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: s_mul_i32 s9, s3, s8 +; GCN64-NEXT: s_mul_hi_u32 s10, s2, s8 +; GCN64-NEXT: s_mul_i32 s8, s2, s8 +; GCN64-NEXT: s_add_i32 s10, s10, s9 +; GCN64-NEXT: v_mov_b32_e32 v1, s8 +; GCN64-NEXT: v_mov_b32_e32 v2, s10 +; GCN64-NEXT: s_mov_b32 s10, -1 +; GCN64-NEXT: s_mov_b32 s8, s6 +; GCN64-NEXT: s_mov_b32 s9, s7 +; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN64-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GCN64-NEXT: s_waitcnt vmcnt(0) +; GCN64-NEXT: buffer_gl0_inv +; GCN64-NEXT: buffer_gl1_inv +; GCN64-NEXT: BB10_2: +; GCN64-NEXT: s_waitcnt_depctr 0xffe3 +; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN64-NEXT: s_waitcnt lgkmcnt(0) +; GCN64-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN64-NEXT: v_mul_hi_u32 v4, s2, v0 +; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN64-NEXT: v_readfirstlane_b32 s0, v1 +; GCN64-NEXT: v_readfirstlane_b32 s1, v2 +; GCN64-NEXT: s_mov_b32 s7, 0x31016000 +; GCN64-NEXT: s_mov_b32 s6, -1 +; GCN64-NEXT: v_add_nc_u32_e32 v1, v4, v3 +; GCN64-NEXT: v_sub_co_u32_e64 v0, vcc, s0, v0 +; GCN64-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc +; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN64-NEXT: s_endpgm +; +; GCN32-LABEL: sub_i64_uniform: +; GCN32: ; %bb.0: ; %entry +; GCN32-NEXT: s_clause 0x1 +; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GCN32-NEXT: s_mov_b32 s8, exec_lo +; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GCN32-NEXT: s_cbranch_execz BB10_2 +; GCN32-NEXT: ; %bb.1: +; GCN32-NEXT: s_bcnt1_i32_b32 s1, s8 +; GCN32-NEXT: s_mov_b32 s11, 0x31016000 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; 
GCN32-NEXT: s_mul_i32 s8, s3, s1 +; GCN32-NEXT: s_mul_hi_u32 s9, s2, s1 +; GCN32-NEXT: s_mul_i32 s1, s2, s1 +; GCN32-NEXT: s_add_i32 s9, s9, s8 +; GCN32-NEXT: v_mov_b32_e32 v1, s1 +; GCN32-NEXT: v_mov_b32_e32 v2, s9 +; GCN32-NEXT: s_mov_b32 s10, -1 +; GCN32-NEXT: s_mov_b32 s8, s6 +; GCN32-NEXT: s_mov_b32 s9, s7 +; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 +; GCN32-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GCN32-NEXT: s_waitcnt vmcnt(0) +; GCN32-NEXT: buffer_gl0_inv +; GCN32-NEXT: buffer_gl1_inv +; GCN32-NEXT: BB10_2: +; GCN32-NEXT: s_waitcnt_depctr 0xffe3 +; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GCN32-NEXT: s_waitcnt lgkmcnt(0) +; GCN32-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN32-NEXT: v_mul_hi_u32 v4, s2, v0 +; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 +; GCN32-NEXT: v_readfirstlane_b32 s0, v1 +; GCN32-NEXT: v_readfirstlane_b32 s1, v2 +; GCN32-NEXT: s_mov_b32 s7, 0x31016000 +; GCN32-NEXT: s_mov_b32 s6, -1 +; GCN32-NEXT: v_add_nc_u32_e32 v1, v4, v3 +; GCN32-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s0, v0 +; GCN32-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN32-NEXT: s_endpgm entry: %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel store i64 %old, i64 addrspace(1)* %out ret void } -; GCN-LABEL: sub_i64_varying: -; GCN-NOT: v_mbcnt_lo_u32_b32 -; GCN-NOT: v_mbcnt_hi_u32_b32 -; GCN-NOT: s_bcnt1_i32_b64 -; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { +; GFX7LESS-LABEL: sub_i64_varying: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: s_mov_b32 s10, s6 +; GFX7LESS-NEXT: s_mov_b32 s11, s7 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mov_b32 s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s9, s3 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_mov_b32 s4, s0 +; GFX7LESS-NEXT: s_mov_b32 s5, s1 +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS-NEXT: s_endpgm +; +; GFX89-LABEL: sub_i64_varying: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX89-NEXT: s_mov_b32 s3, 0xf000 +; GFX89-NEXT: s_mov_b32 s2, -1 +; GFX89-NEXT: v_mov_b32_e32 v1, 0 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s0, s4 +; GFX89-NEXT: s_mov_b32 s1, s5 +; GFX89-NEXT: s_mov_b32 s4, s6 +; GFX89-NEXT: s_mov_b32 s5, s7 +; GFX89-NEXT: s_mov_b32 s6, s2 +; GFX89-NEXT: s_mov_b32 s7, s3 +; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX89-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 glc +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: buffer_wbinvl1_vol +; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX89-NEXT: s_endpgm +; +; GFX10-LABEL: sub_i64_varying: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: s_mov_b32 s7, 0x31016000 +; GFX10-NEXT: s_mov_b32 s6, -1 +; GFX10-NEXT: s_mov_b32 s11, s7 +; GFX10-NEXT: s_mov_b32 s10, s6 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_mov_b32 s8, s2 +; GFX10-NEXT: s_mov_b32 s9, s3 +; GFX10-NEXT: s_mov_b32 s4, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_mov_b32 s5, s1 +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX10-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 -- GitLab From d8b8f544d9de30cd14584094596090d3f9992345 Mon Sep 17 00:00:00 2001 From: Elizabeth Andrews Date: Thu, 18 Mar 2021 02:58:35 -0700 Subject: [PATCH 0021/1000] [Reland] "Do not apply calling conventions to MSVC entry points" This patch is a second attempt at fixing a link error for MSVC entry points when calling conventions are specified using a flag. Calling conventions specified using flags should not be applied to MSVC entry points. The default calling convention is set in this case. The default calling convention for MSVC entry points main and wmain is cdecl. For WinMain, wWinMain and DllMain, the default calling convention is stdcall on 32 bit Windows. Explicitly specified calling conventions are applied to MSVC entry points. For MinGW, the default calling convention for all MSVC entry points is cdecl. First attempt: 4cff1b40dacf6 Revert of first attempt: bebfc3b92d5e8 Differential Revision: https://reviews.llvm.org/D97941 --- clang/lib/Sema/SemaDecl.cpp | 34 ++++++++++++++ .../test/CodeGenCXX/default_calling_conv.cpp | 45 +++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index b962bd965223..76e3ee965777 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -11173,6 +11173,25 @@ void Sema::CheckMain(FunctionDecl* FD, const DeclSpec& DS) { } } +static bool isDefaultStdCall(FunctionDecl *FD, Sema &S) { + + // Default calling convention for main and wmain is __cdecl + if (FD->getName() == "main" || FD->getName() == "wmain") + return false; + + // Default calling convention for MinGW is __cdecl + const llvm::Triple &T = S.Context.getTargetInfo().getTriple(); + if (T.isWindowsGNUEnvironment()) + return false; + + // Default calling convention for WinMain, wWinMain and DllMain + // is __stdcall on 32 bit Windows + if (T.isOSWindows() && T.getArch() == llvm::Triple::x86) + return true; + + return false; +} + void Sema::CheckMSVCRTEntryPoint(FunctionDecl *FD) { QualType T = FD->getType(); assert(T->isFunctionType() && "function decl is not of function type"); @@ -11187,6 +11206,21 @@ void Sema::CheckMSVCRTEntryPoint(FunctionDecl *FD) { if (FD->getName() != "DllMain") FD->setHasImplicitReturnZero(true); + // Explicity specified calling conventions are applied to MSVC entry points + if (!hasExplicitCallingConv(T)) { + if (isDefaultStdCall(FD, *this)) { + if (FT->getCallConv() != CC_X86StdCall) { + FT = Context.adjustFunctionType( + FT, FT->getExtInfo().withCallingConv(CC_X86StdCall)); + FD->setType(QualType(FT, 0)); + } + } else if (FT->getCallConv() != CC_C) { + FT = Context.adjustFunctionType(FT, + FT->getExtInfo().withCallingConv(CC_C)); + FD->setType(QualType(FT, 0)); + } + } + if (!FD->isInvalidDecl() && FD->getDescribedFunctionTemplate()) { Diag(FD->getLocation(), diag::err_mainlike_template_decl) << FD; FD->setInvalidDecl(); diff --git a/clang/test/CodeGenCXX/default_calling_conv.cpp b/clang/test/CodeGenCXX/default_calling_conv.cpp index e3d7ac429a60..83d1200e0ab1 100644 --- a/clang/test/CodeGenCXX/default_calling_conv.cpp +++ b/clang/test/CodeGenCXX/default_calling_conv.cpp @@ 
-4,6 +4,9 @@ // RUN: %clang_cc1 -triple i486-unknown-linux-gnu -mrtd -emit-llvm -o - %s | FileCheck %s --check-prefix=STDCALL --check-prefix=ALL // RUN: %clang_cc1 -triple i986-unknown-linux-gnu -fdefault-calling-conv=vectorcall -emit-llvm -o - %s | FileCheck %s --check-prefix=VECTORCALL --check-prefix=ALL // RUN: %clang_cc1 -triple i986-unknown-linux-gnu -fdefault-calling-conv=regcall -emit-llvm -o - %s | FileCheck %s --check-prefix=REGCALL --check-prefix=ALL +// RUN: %clang_cc1 -triple i686-pc-win32 -fdefault-calling-conv=vectorcall -emit-llvm -o - %s -DWINDOWS | FileCheck %s --check-prefix=WIN32 +// RUN: %clang_cc1 -triple x86_64-windows-msvc -fdefault-calling-conv=vectorcall -emit-llvm -o - %s -DWINDOWS | FileCheck %s --check-prefix=WIN64 +// RUN: %clang_cc1 -triple i686-pc-win32 -emit-llvm -o - %s -DEXPLICITCC | FileCheck %s --check-prefix=EXPLICITCC // CDECL: define{{.*}} void @_Z5test1v // FASTCALL: define{{.*}} x86_fastcallcc void @_Z5test1v @@ -50,3 +53,45 @@ void test() { int main() { return 1; } + +#ifdef WINDOWS +// WIN32: define dso_local i32 @wmain +// WIN64: define dso_local i32 @wmain +int wmain() { + return 1; +} +// WIN32: define dso_local x86_stdcallcc i32 @WinMain +// WIN64: define dso_local i32 @WinMain +int WinMain() { + return 1; +} +// WIN32: define dso_local x86_stdcallcc i32 @wWinMain +// WIN64: define dso_local i32 @wWinMain +int wWinMain() { + return 1; +} +// WIN32: define dso_local x86_stdcallcc i32 @DllMain +// WIN64: define dso_local i32 @DllMain +int DllMain() { + return 1; +} +#endif // Windows + +#ifdef EXPLICITCC +// EXPLICITCC: define dso_local x86_fastcallcc i32 @wmain +int __fastcall wmain() { + return 1; +} +// EXPLICITCC: define dso_local x86_fastcallcc i32 @WinMain +int __fastcall WinMain() { + return 1; +} +// EXPLICITCC: define dso_local x86_fastcallcc i32 @wWinMain +int __fastcall wWinMain() { + return 1; +} +// EXPLICITCC: define dso_local x86_fastcallcc i32 @DllMain +int __fastcall DllMain() { + return 1; +} +#endif // ExplicitCC -- GitLab From 8b8b9af8c9132acb446fc42569de8a0f57c6b556 Mon Sep 17 00:00:00 2001 From: Valeriy Savchenko Date: Thu, 18 Mar 2021 14:22:45 +0300 Subject: [PATCH 0022/1000] [-Wcalled-once-parameter][NFC] Fix GCC compilation error --- clang/lib/Analysis/CalledOnceCheck.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clang/lib/Analysis/CalledOnceCheck.cpp b/clang/lib/Analysis/CalledOnceCheck.cpp index ab56d3e3c988..00bb51a1c0d3 100644 --- a/clang/lib/Analysis/CalledOnceCheck.cpp +++ b/clang/lib/Analysis/CalledOnceCheck.cpp @@ -63,14 +63,14 @@ struct KnownCalledOnceParameter { unsigned ParamIndex; }; constexpr KnownCalledOnceParameter KNOWN_CALLED_ONCE_PARAMETERS[] = { - {"dispatch_async", 1}, - {"dispatch_async_and_wait", 1}, - {"dispatch_after", 2}, - {"dispatch_sync", 1}, - {"dispatch_once", 1}, - {"dispatch_barrier_async", 1}, - {"dispatch_barrier_async_and_wait", 1}, - {"dispatch_barrier_sync", 1}}; + {llvm::StringLiteral{"dispatch_async"}, 1}, + {llvm::StringLiteral{"dispatch_async_and_wait"}, 1}, + {llvm::StringLiteral{"dispatch_after"}, 2}, + {llvm::StringLiteral{"dispatch_sync"}, 1}, + {llvm::StringLiteral{"dispatch_once"}, 1}, + {llvm::StringLiteral{"dispatch_barrier_async"}, 1}, + {llvm::StringLiteral{"dispatch_barrier_async_and_wait"}, 1}, + {llvm::StringLiteral{"dispatch_barrier_sync"}, 1}}; class ParameterStatus { public: -- GitLab From c1fb23c1aadd22e736c4a1c36c146bbfbc48f959 Mon Sep 17 00:00:00 2001 From: Balazs Benics Date: Thu, 18 Mar 2021 13:06:38 +0100 Subject: 
[PATCH 0023/1000] [clang][ASTImporter] Fix import of VarDecl regarding
 thread local storage spec

After the import, we did not copy the `TSCSpec`. This commit resolves
that.

Reviewed By: balazske

Differential Revision: https://reviews.llvm.org/D98707
---
 clang/lib/AST/ASTImporter.cpp           | 1 +
 clang/unittests/AST/ASTImporterTest.cpp | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index f4dfc54b36cb..d48e173eb3b3 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -4018,6 +4018,7 @@ ExpectedDecl ASTNodeImporter::VisitVarDecl(VarDecl *D) {
           D->getStorageClass()))
     return ToVar;
 
+  ToVar->setTSCSpec(D->getTSCSpec());
   ToVar->setQualifierInfo(ToQualifierLoc);
   ToVar->setAccess(D->getAccess());
   ToVar->setLexicalDeclContext(LexicalDC);
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 39612d43799b..43464cc0c9ca 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -735,6 +735,12 @@ TEST_P(ImportDecl, ImportRecordDeclInFunc) {
                  has(declStmt(hasSingleDecl(varDecl(hasName("d")))))))));
 }
 
+TEST_P(ImportDecl, ImportedVarDeclPreservesThreadLocalStorage) {
+  MatchVerifier<Decl> Verifier;
+  testImport("thread_local int declToImport;", Lang_CXX11, "", Lang_CXX11,
+             Verifier, varDecl(hasThreadStorageDuration()));
+}
+
 TEST_P(ASTImporterOptionSpecificTestBase, ImportRecordTypeInFunc) {
   Decl *FromTU = getTuDecl("int declToImport() { "
                            "  struct data_t {int a;int b;};"
--
GitLab

From c8893f3b784c4b8877275801029b4ebb54408f66 Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Thu, 18 Mar 2021 08:09:28 -0400
Subject: [PATCH 0024/1000] [LoopVectorize] relax FMF constraint for FP
 induction

This makes the induction part of the loop vectorizer match the
reduction part. We do not need all of the fast-math-flags. For example,
there are some that clearly are not in play like arcp or afn.

If we want to make FMF constraints consistent across the IR optimizer,
we might want to add nsz too, but that's up for debate (users can't
expect associative FP math and preservation of sign-of-zero at the same
time?).

The calling code was fixed to avoid miscompiles with:
1bee549737ac

Differential Revision: https://reviews.llvm.org/D98708
---
 llvm/include/llvm/Analysis/IVDescriptors.h    |   5 +-
 .../LoopVectorize/X86/float-induction-x86.ll  | 201 +++++++++++-------
 .../LoopVectorize/float-induction.ll          |  37 +++-
 3 files changed, 162 insertions(+), 81 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 41d353dcd573..0a8d5c0d2eae 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -305,10 +305,9 @@ public:
   /// Returns floating-point induction operator that does not allow
   /// reassociation (transforming the induction requires an override of normal
   /// floating-point rules).
-  /// TODO: This should not require the full 'fast' FMF, but caller code
-  /// may need to be fixed to propagate FMF correctly.
Instruction *getExactFPMathInst() { - if (IK == IK_FpInduction && InductionBinOp && !InductionBinOp->isFast()) + if (IK == IK_FpInduction && InductionBinOp && + !InductionBinOp->hasAllowReassoc()) return InductionBinOp; return nullptr; } diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index 9db01e701010..631b43c79340 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -552,88 +552,137 @@ define void @fadd_reassoc_FMF(float* nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: br i1 [[CMP_NOT11]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] ; AUTO_VEC: for.body.preheader: ; AUTO_VEC-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 -; AUTO_VEC-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -1 -; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP0]], 7 -; AUTO_VEC-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 7 -; AUTO_VEC-NEXT: br i1 [[TMP2]], label [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]] -; AUTO_VEC: for.body.preheader.new: -; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP0]], 4294967288 -; AUTO_VEC-NEXT: br label [[FOR_BODY:%.*]] -; AUTO_VEC: for.cond.cleanup.loopexit.unr-lcssa: -; AUTO_VEC-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY]] ] -; AUTO_VEC-NEXT: [[X_012_UNR:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[ADD3_7:%.*]], [[FOR_BODY]] ] +; AUTO_VEC-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 32 +; AUTO_VEC-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] +; AUTO_VEC: vector.ph: +; AUTO_VEC-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 4294967264 +; AUTO_VEC-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float +; AUTO_VEC-NEXT: [[TMP1:%.*]] = fmul reassoc float [[CAST_CRD]], 4.200000e+01 +; AUTO_VEC-NEXT: [[IND_END:%.*]] = fadd reassoc float [[TMP1]], 1.000000e+00 +; AUTO_VEC-NEXT: [[TMP2:%.*]] = add nsw i64 [[N_VEC]], -32 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = lshr exact i64 [[TMP2]], 5 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1 +; AUTO_VEC-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP4]], 1 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP2]], 0 +; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]] +; AUTO_VEC: vector.ph.new: +; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], 1152921504606846974 +; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]] +; AUTO_VEC: vector.body: +; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND:%.*]] = phi <8 x float> [ , [[VECTOR_PH_NEW]] ], [ [[VEC_IND_NEXT_1:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[VECTOR_PH_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[STEP_ADD:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], +; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], +; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], +; AUTO_VEC-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 [[INDEX]] +; AUTO_VEC-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4 +; AUTO_VEC-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 8 +; 
AUTO_VEC-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4 +; AUTO_VEC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 16 +; AUTO_VEC-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4 +; AUTO_VEC-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP6]], i64 24 +; AUTO_VEC-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4 +; AUTO_VEC-NEXT: [[TMP14:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] +; AUTO_VEC-NEXT: [[TMP15:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] +; AUTO_VEC-NEXT: [[TMP16:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], [[WIDE_LOAD6]] +; AUTO_VEC-NEXT: [[TMP17:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], [[WIDE_LOAD7]] +; AUTO_VEC-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP6]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP14]], <8 x float>* [[TMP18]], align 4 +; AUTO_VEC-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP15]], <8 x float>* [[TMP19]], align 4 +; AUTO_VEC-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP10]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP16]], <8 x float>* [[TMP20]], align 4 +; AUTO_VEC-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP12]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP17]], <8 x float>* [[TMP21]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3]], +; AUTO_VEC-NEXT: [[STEP_ADD_1:%.*]] = fadd reassoc <8 x float> [[VEC_IND_NEXT]], +; AUTO_VEC-NEXT: [[STEP_ADD2_1:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_1]], +; AUTO_VEC-NEXT: [[STEP_ADD3_1:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_1]], +; AUTO_VEC-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDEX_NEXT]] +; AUTO_VEC-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4 +; AUTO_VEC-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 8 +; AUTO_VEC-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD5_1:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4 +; AUTO_VEC-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 16 +; AUTO_VEC-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD6_1:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4 +; AUTO_VEC-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP22]], i64 24 +; AUTO_VEC-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD7_1:%.*]] = load <8 x float>, <8 x float>* [[TMP29]], align 4 +; AUTO_VEC-NEXT: [[TMP30:%.*]] = fadd reassoc <8 x float> [[VEC_IND_NEXT]], [[WIDE_LOAD_1]] +; AUTO_VEC-NEXT: [[TMP31:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_1]], [[WIDE_LOAD5_1]] +; AUTO_VEC-NEXT: [[TMP32:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_1]], [[WIDE_LOAD6_1]] +; AUTO_VEC-NEXT: [[TMP33:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3_1]], [[WIDE_LOAD7_1]] +; AUTO_VEC-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP22]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP30]], <8 x float>* 
[[TMP34]], align 4 +; AUTO_VEC-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP24]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP31]], <8 x float>* [[TMP35]], align 4 +; AUTO_VEC-NEXT: [[TMP36:%.*]] = bitcast float* [[TMP26]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP32]], <8 x float>* [[TMP36]], align 4 +; AUTO_VEC-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP28]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP33]], <8 x float>* [[TMP37]], align 4 +; AUTO_VEC-NEXT: [[INDEX_NEXT_1]] = add i64 [[INDEX]], 64 +; AUTO_VEC-NEXT: [[VEC_IND_NEXT_1]] = fadd reassoc <8 x float> [[STEP_ADD3_1]], +; AUTO_VEC-NEXT: [[NITER_NSUB_1]] = add i64 [[NITER]], -2 +; AUTO_VEC-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NSUB_1]], 0 +; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; AUTO_VEC: middle.block.unr-lcssa: +; AUTO_VEC-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ] +; AUTO_VEC-NEXT: [[VEC_IND_UNR:%.*]] = phi <8 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT_1]], [[VECTOR_BODY]] ] ; AUTO_VEC-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0 -; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_EPIL:%.*]] -; AUTO_VEC: for.body.epil: -; AUTO_VEC-NEXT: [[INDVARS_IV_EPIL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_EPIL:%.*]], [[FOR_BODY_EPIL]] ], [ [[INDVARS_IV_UNR]], [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[X_012_EPIL:%.*]] = phi float [ [[ADD3_EPIL:%.*]], [[FOR_BODY_EPIL]] ], [ [[X_012_UNR]], [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ [[EPIL_ITER_SUB:%.*]], [[FOR_BODY_EPIL]] ], [ [[XTRAITER]], [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]] ] -; AUTO_VEC-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 [[INDVARS_IV_EPIL]] -; AUTO_VEC-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX_EPIL]], align 4 -; AUTO_VEC-NEXT: [[ADD_EPIL:%.*]] = fadd reassoc float [[X_012_EPIL]], [[TMP3]] -; AUTO_VEC-NEXT: store float [[ADD_EPIL]], float* [[ARRAYIDX_EPIL]], align 4 -; AUTO_VEC-NEXT: [[ADD3_EPIL]] = fadd reassoc float [[X_012_EPIL]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_EPIL]] = add nuw nsw i64 [[INDVARS_IV_EPIL]], 1 -; AUTO_VEC-NEXT: [[EPIL_ITER_SUB]] = add i64 [[EPIL_ITER]], -1 -; AUTO_VEC-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[EPIL_ITER_SUB]], 0 -; AUTO_VEC-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_EPIL]], !llvm.loop [[LOOP11:![0-9]+]] +; AUTO_VEC-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]] +; AUTO_VEC: vector.body.epil: +; AUTO_VEC-NEXT: [[STEP_ADD_EPIL:%.*]] = fadd reassoc <8 x float> [[VEC_IND_UNR]], +; AUTO_VEC-NEXT: [[STEP_ADD2_EPIL:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_EPIL]], +; AUTO_VEC-NEXT: [[STEP_ADD3_EPIL:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_EPIL]], +; AUTO_VEC-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDEX_UNR]] +; AUTO_VEC-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP39]], align 4 +; AUTO_VEC-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 8 +; AUTO_VEC-NEXT: [[TMP41:%.*]] = bitcast float* [[TMP40]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD5_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP41]], align 4 +; AUTO_VEC-NEXT: [[TMP42:%.*]] = 
getelementptr inbounds float, float* [[TMP38]], i64 16 +; AUTO_VEC-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD6_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP43]], align 4 +; AUTO_VEC-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[TMP38]], i64 24 +; AUTO_VEC-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* +; AUTO_VEC-NEXT: [[WIDE_LOAD7_EPIL:%.*]] = load <8 x float>, <8 x float>* [[TMP45]], align 4 +; AUTO_VEC-NEXT: [[TMP46:%.*]] = fadd reassoc <8 x float> [[VEC_IND_UNR]], [[WIDE_LOAD_EPIL]] +; AUTO_VEC-NEXT: [[TMP47:%.*]] = fadd reassoc <8 x float> [[STEP_ADD_EPIL]], [[WIDE_LOAD5_EPIL]] +; AUTO_VEC-NEXT: [[TMP48:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2_EPIL]], [[WIDE_LOAD6_EPIL]] +; AUTO_VEC-NEXT: [[TMP49:%.*]] = fadd reassoc <8 x float> [[STEP_ADD3_EPIL]], [[WIDE_LOAD7_EPIL]] +; AUTO_VEC-NEXT: [[TMP50:%.*]] = bitcast float* [[TMP38]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP46]], <8 x float>* [[TMP50]], align 4 +; AUTO_VEC-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP40]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP47]], <8 x float>* [[TMP51]], align 4 +; AUTO_VEC-NEXT: [[TMP52:%.*]] = bitcast float* [[TMP42]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP48]], <8 x float>* [[TMP52]], align 4 +; AUTO_VEC-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP44]] to <8 x float>* +; AUTO_VEC-NEXT: store <8 x float> [[TMP49]], <8 x float>* [[TMP53]], align 4 +; AUTO_VEC-NEXT: br label [[MIDDLE_BLOCK]] +; AUTO_VEC: middle.block: +; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; AUTO_VEC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] ; AUTO_VEC: for.cond.cleanup: ; AUTO_VEC-NEXT: ret void ; AUTO_VEC: for.body: -; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INDVARS_IV_NEXT_7]], [[FOR_BODY]] ] -; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ 1.000000e+00, [[FOR_BODY_PREHEADER_NEW]] ], [ [[ADD3_7]], [[FOR_BODY]] ] -; AUTO_VEC-NEXT: [[NITER:%.*]] = phi i64 [ [[UNROLL_ITER]], [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NSUB_7:%.*]], [[FOR_BODY]] ] +; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; AUTO_VEC-NEXT: [[X_012:%.*]] = phi float [ [[ADD3:%.*]], [[FOR_BODY]] ], [ 1.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; AUTO_VEC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV]] -; AUTO_VEC-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP4]] +; AUTO_VEC-NEXT: [[TMP54:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; AUTO_VEC-NEXT: [[ADD:%.*]] = fadd reassoc float [[X_012]], [[TMP54]] ; AUTO_VEC-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4 -; AUTO_VEC-NEXT: [[ADD3:%.*]] = fadd reassoc float [[X_012]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 -; AUTO_VEC-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV_NEXT]] -; AUTO_VEC-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX_1]], align 4 -; AUTO_VEC-NEXT: [[ADD_1:%.*]] = fadd reassoc float [[ADD3]], [[TMP5]] -; AUTO_VEC-NEXT: store float [[ADD_1]], float* [[ARRAYIDX_1]], align 4 -; AUTO_VEC-NEXT: [[ADD3_1:%.*]] = fadd reassoc float [[ADD3]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 -; 
AUTO_VEC-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV_NEXT_1]] -; AUTO_VEC-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX_2]], align 4 -; AUTO_VEC-NEXT: [[ADD_2:%.*]] = fadd reassoc float [[ADD3_1]], [[TMP6]] -; AUTO_VEC-NEXT: store float [[ADD_2]], float* [[ARRAYIDX_2]], align 4 -; AUTO_VEC-NEXT: [[ADD3_2:%.*]] = fadd reassoc float [[ADD3_1]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 -; AUTO_VEC-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV_NEXT_2]] -; AUTO_VEC-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX_3]], align 4 -; AUTO_VEC-NEXT: [[ADD_3:%.*]] = fadd reassoc float [[ADD3_2]], [[TMP7]] -; AUTO_VEC-NEXT: store float [[ADD_3]], float* [[ARRAYIDX_3]], align 4 -; AUTO_VEC-NEXT: [[ADD3_3:%.*]] = fadd reassoc float [[ADD3_2]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = or i64 [[INDVARS_IV]], 4 -; AUTO_VEC-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV_NEXT_3]] -; AUTO_VEC-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX_4]], align 4 -; AUTO_VEC-NEXT: [[ADD_4:%.*]] = fadd reassoc float [[ADD3_3]], [[TMP8]] -; AUTO_VEC-NEXT: store float [[ADD_4]], float* [[ARRAYIDX_4]], align 4 -; AUTO_VEC-NEXT: [[ADD3_4:%.*]] = fadd reassoc float [[ADD3_3]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = or i64 [[INDVARS_IV]], 5 -; AUTO_VEC-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV_NEXT_4]] -; AUTO_VEC-NEXT: [[TMP9:%.*]] = load float, float* [[ARRAYIDX_5]], align 4 -; AUTO_VEC-NEXT: [[ADD_5:%.*]] = fadd reassoc float [[ADD3_4]], [[TMP9]] -; AUTO_VEC-NEXT: store float [[ADD_5]], float* [[ARRAYIDX_5]], align 4 -; AUTO_VEC-NEXT: [[ADD3_5:%.*]] = fadd reassoc float [[ADD3_4]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = or i64 [[INDVARS_IV]], 6 -; AUTO_VEC-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV_NEXT_5]] -; AUTO_VEC-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX_6]], align 4 -; AUTO_VEC-NEXT: [[ADD_6:%.*]] = fadd reassoc float [[ADD3_5]], [[TMP10]] -; AUTO_VEC-NEXT: store float [[ADD_6]], float* [[ARRAYIDX_6]], align 4 -; AUTO_VEC-NEXT: [[ADD3_6:%.*]] = fadd reassoc float [[ADD3_5]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = or i64 [[INDVARS_IV]], 7 -; AUTO_VEC-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[P]], i64 [[INDVARS_IV_NEXT_6]] -; AUTO_VEC-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX_7]], align 4 -; AUTO_VEC-NEXT: [[ADD_7:%.*]] = fadd reassoc float [[ADD3_6]], [[TMP11]] -; AUTO_VEC-NEXT: store float [[ADD_7]], float* [[ARRAYIDX_7]], align 4 -; AUTO_VEC-NEXT: [[ADD3_7]] = fadd reassoc float [[ADD3_6]], 4.200000e+01 -; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV]], 8 -; AUTO_VEC-NEXT: [[NITER_NSUB_7]] = add i64 [[NITER]], -8 -; AUTO_VEC-NEXT: [[NITER_NCMP_7:%.*]] = icmp eq i64 [[NITER_NSUB_7]], 0 -; AUTO_VEC-NEXT: br i1 [[NITER_NCMP_7]], label [[FOR_COND_CLEANUP_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY]] +; AUTO_VEC-NEXT: [[ADD3]] = fadd reassoc float [[X_012]], 4.200000e+01 +; AUTO_VEC-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AUTO_VEC-NEXT: [[CMP_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]] +; AUTO_VEC-NEXT: br i1 [[CMP_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; entry: %cmp.not11 = icmp eq i32 %N, 0 diff --git 
a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 291c01efc3ab..bc4b4b02497e 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -235,10 +235,43 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, float* noalias nocapture %A, i ; VEC1_INTERL2-NEXT: br i1 [[CMP4]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; VEC1_INTERL2: for.body.lr.ph: ; VEC1_INTERL2-NEXT: [[FPINC:%.*]] = load float, float* @fp_inc, align 4 +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; VEC1_INTERL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[TMP0]], 0 +; VEC1_INTERL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC1_INTERL2: vector.ph: +; VEC1_INTERL2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934590 +; VEC1_INTERL2-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = fmul reassoc float [[FPINC]], [[CAST_CRD]] +; VEC1_INTERL2-NEXT: [[IND_END:%.*]] = fsub reassoc float [[INIT:%.*]], [[TMP3]] +; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC1_INTERL2: vector.body: +; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC1_INTERL2-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP4:%.*]] = sitofp i64 [[INDEX]] to float +; VEC1_INTERL2-NEXT: [[TMP5:%.*]] = fmul reassoc float [[FPINC]], [[TMP4]] +; VEC1_INTERL2-NEXT: [[OFFSET_IDX:%.*]] = fsub reassoc float [[INIT]], [[TMP5]] +; VEC1_INTERL2-NEXT: [[TMP6:%.*]] = fmul reassoc float [[FPINC]], 0.000000e+00 +; VEC1_INTERL2-NEXT: [[TMP7:%.*]] = fsub reassoc float [[OFFSET_IDX]], [[TMP6]] +; VEC1_INTERL2-NEXT: [[TMP8:%.*]] = fsub reassoc float [[OFFSET_IDX]], [[FPINC]] +; VEC1_INTERL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION2]] +; VEC1_INTERL2-NEXT: store float [[TMP7]], float* [[TMP9]], align 4 +; VEC1_INTERL2-NEXT: store float [[TMP8]], float* [[TMP10]], align 4 +; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; VEC1_INTERL2-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VEC1_INTERL2-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VEC1_INTERL2: middle.block: +; VEC1_INTERL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] +; VEC1_INTERL2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; VEC1_INTERL2: scalar.ph: +; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ] +; VEC1_INTERL2-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[FOR_BODY_LR_PH]] ] ; VEC1_INTERL2-NEXT: br label [[FOR_BODY:%.*]] ; VEC1_INTERL2: for.body: -; VEC1_INTERL2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; VEC1_INTERL2-NEXT: [[X_05:%.*]] = phi float [ [[INIT:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] +; VEC1_INTERL2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; VEC1_INTERL2-NEXT: [[X_05:%.*]] = phi float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ 
[[ADD:%.*]], [[FOR_BODY]] ]
; VEC1_INTERL2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDVARS_IV]]
; VEC1_INTERL2-NEXT:    store float [[X_05]], float* [[ARRAYIDX]], align 4
; VEC1_INTERL2-NEXT:    [[ADD]] = fsub reassoc float [[X_05]], [[FPINC]]
--
GitLab

From e5cd5b352ff481f02e1f4555033edf87112dcc0c Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme
Date: Thu, 18 Mar 2021 10:36:15 +0000
Subject: [PATCH 0025/1000] [test] Fix variable definition in acle_sve_ld1sh.c

The Clang test acle_sve_ld1sh.c is missing the colon that separates the
variable name from the regex in one of its string variable definitions.
This causes the substitution block to be parsed as a numeric variable
use.

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D98852
---
 clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
index 6475b19ab653..6e3b32e1cc19 100644
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1sh.c
@@ -114,7 +114,7 @@ svint32_t test_svld1sh_gather_u32base_s32(svbool_t pg, svuint32_t bases) {
 
 svint64_t test_svld1sh_gather_u64base_s64(svbool_t pg, svuint64_t bases) {
   // CHECK-LABEL: test_svld1sh_gather_u64base_s64
-  // CHECK: %[[PG.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[PG:.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
   // CHECK: %[[LOAD:.*]] = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %[[PG]], <vscale x 2 x i64> %bases, i64 0)
   // CHECK: %[[SEXT:.*]] = sext <vscale x 2 x i16> %[[LOAD]] to <vscale x 2 x i64>
   // CHECK: ret <vscale x 2 x i64> %[[SEXT]]
--
GitLab

From c5c4a88a840037fd38cb35d5efd524d51dcc091b Mon Sep 17 00:00:00 2001
From: Sven van Haastregt
Date: Thu, 18 Mar 2021 12:17:12 +0000
Subject: [PATCH 0026/1000] [OpenCL] Remove spurious atomic_fetch tablegen
 builtins

The `int` and `long` versions of these builtins already provide the
necessary overloads for `intptr_t` and `uintptr_t` arguments, as
`ASTContext` defines `atomic_(u)intptr_t` in terms of the `int` or
`long` types.

Prior to this patch, calls to those builtins with particular argument
types resulted in call-is-ambiguous errors.
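For illustration, here is a minimal sketch of the kind of call that
used to be rejected (hypothetical code, not the regression test this
patch adds to fdeclare-opencl-builtins.cl):

  void f(volatile __generic atomic_intptr_t *a, intptr_t v) {
    // atomic_intptr_t is atomic_int or atomic_long under the hood, so
    // the explicit intptr_t overload and the int/long overload matched
    // this call equally well, and clang reported:
    //   error: call to 'atomic_fetch_or' is ambiguous
    intptr_t r = atomic_fetch_or(a, v);
  }

With the spurious tablegen entries removed, only the int/long overloads
remain and such calls resolve normally.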
Differential Revision: https://reviews.llvm.org/D98520
---
 clang/lib/Sema/OpenCLBuiltins.td               |  5 +----
 .../SemaOpenCL/fdeclare-opencl-builtins.cl     | 21 +++++++++++++++++++
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Sema/OpenCLBuiltins.td b/clang/lib/Sema/OpenCLBuiltins.td
index d6d77dc90d30..1ff658e567b8 100644
--- a/clang/lib/Sema/OpenCLBuiltins.td
+++ b/clang/lib/Sema/OpenCLBuiltins.td
@@ -1100,7 +1100,6 @@ let MinVersion = CL20 in {
   foreach TypePair = [[AtomicInt, Int, Int], [AtomicUInt, UInt, UInt],
                       [AtomicLong, Long, Long], [AtomicULong, ULong, ULong],
-                      [AtomicIntPtr, IntPtr, PtrDiff],
                       [AtomicUIntPtr, UIntPtr, PtrDiff]] in {
     foreach ModOp = ["add", "sub"] in {
       def : Builtin<"atomic_fetch_" # ModOp,
    }
  }
   foreach TypePair = [[AtomicInt, Int, Int], [AtomicUInt, UInt, UInt],
-                      [AtomicLong, Long, Long], [AtomicULong, ULong, ULong],
-                      [AtomicIntPtr, IntPtr, IntPtr],
-                      [AtomicUIntPtr, UIntPtr, UIntPtr]] in {
+                      [AtomicLong, Long, Long], [AtomicULong, ULong, ULong]] in {
     foreach ModOp = ["or", "xor", "and", "min", "max"] in {
       def : Builtin<"atomic_fetch_" # ModOp,
                     [TypePair[1], PointerType<VolatileType<TypePair[0]>, GenericAS>, TypePair[2]]>;
diff --git a/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl b/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl
index 825dd3a935d0..103d1d8b262b 100644
--- a/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl
+++ b/clang/test/SemaOpenCL/fdeclare-opencl-builtins.cl
@@ -39,6 +39,9 @@ typedef unsigned int uint;
 typedef unsigned long ulong;
 typedef unsigned short ushort;
 typedef __SIZE_TYPE__ size_t;
+typedef __PTRDIFF_TYPE__ ptrdiff_t;
+typedef __INTPTR_TYPE__ intptr_t;
+typedef __UINTPTR_TYPE__ uintptr_t;
 typedef char char2 __attribute__((ext_vector_type(2)));
 typedef char char4 __attribute__((ext_vector_type(4)));
 typedef uchar uchar4 __attribute__((ext_vector_type(4)));
@@ -98,6 +101,24 @@ void test_typedef_args(clk_event_t evt, volatile atomic_flag *flg, global unsign
   size_t ws[2] = {2, 8};
   ndrange_t r = ndrange_2D(ws);
 }
+
+// Check that atomic_fetch_ functions can be called with (u)intptr_t arguments,
+// despite OpenCLBuiltins.td not providing explicit overloads for those types.
+void test_atomic_fetch(volatile __generic atomic_int *a_int,
+                       volatile __generic atomic_intptr_t *a_intptr,
+                       volatile __generic atomic_uintptr_t *a_uintptr) {
+  int i;
+  intptr_t ip;
+  uintptr_t uip;
+  ptrdiff_t ptrdiff;
+
+  i = atomic_fetch_add(a_int, i);
+  ip = atomic_fetch_add(a_intptr, ptrdiff);
+  uip = atomic_fetch_add(a_uintptr, ptrdiff);
+
+  ip = atomic_fetch_or(a_intptr, ip);
+  uip = atomic_fetch_or(a_uintptr, uip);
+}
 #endif
 
 kernel void basic_conversion() {
--
GitLab

From b79044391eb2b58adc34647862f33d3c670fa8e9 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme
Date: Thu, 18 Mar 2021 10:45:55 +0000
Subject: [PATCH 0027/1000] [test] Fix incorrect use of string variable

The LLVM test CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
uses a string substitution block that contains a regex matching block.
This seems to be a copy/paste from another, similar test where the
match also defines a variable, hence the [[]] syntax. In this case,
however, the pattern is on a CHECK-NOT line, so nothing should match.
No variable definition is thus expected and the square brackets can be
dropped.
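For reference, the three FileCheck forms involved here behave as
follows (a generic sketch, not lines taken from this test; the
parenthesized notes are not part of the check lines):

  ; CHECK: [[FUNC:OUTLINED_FUNCTION_[0-9]+]]  (matches the regex, defines FUNC)
  ; CHECK: [[FUNC]]                           (reuses the captured value)
  ; CHECK-NOT: OUTLINED_FUNCTION_{{.*}}       (plain regex match, no capture)

Since a CHECK-NOT pattern must never match, there is no match to define
a variable from, which is why the plain {{...}} regex form is the
correct one.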
Reviewed By: chill

Differential Revision: https://reviews.llvm.org/D98853
---
 .../AArch64/machine-outliner-retaddr-sign-thunk.ll | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
index aea36d969108..3c4eff39c60b 100644
--- a/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
+++ b/llvm/test/CodeGen/AArch64/machine-outliner-retaddr-sign-thunk.ll
@@ -67,7 +67,7 @@ entry:
 
 attributes #0 = { "sign-return-address"="non-leaf" }
 
-; CHECK-NOT: [[OUTLINED_FUNCTION_{{.*}}]]
+; CHECK-NOT: OUTLINED_FUNCTION_{{.*}}
 ; CHECK-NOT: .cfi_b_key_frame
 ; CHECK-NOT: paci{{[a,b]}}sp
 ; CHECK-NOT: hint #2{{[5,7]}}
--
GitLab

From b3a1500ea8007c6ecdca6d502aaba0b03a4f705c Mon Sep 17 00:00:00 2001
From: Max Kazantsev
Date: Thu, 18 Mar 2021 18:28:14 +0700
Subject: [PATCH 0028/1000] [SCEV][NFC] API for predicate evaluation

Provides an API that allows checking whether a predicate is true or
false with one call. The current implementation is naive and just calls
isKnownPredicate twice, but this logic can later be reworked to prove
both facts with a single check.
---
 llvm/include/llvm/Analysis/ScalarEvolution.h | 13 ++++++++++
 llvm/lib/Analysis/ScalarEvolution.cpp        | 27 ++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
index c35c1db7dfe0..206e502673a9 100644
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -938,11 +938,24 @@ public:
   bool isKnownPredicate(ICmpInst::Predicate Pred, const SCEV *LHS,
                         const SCEV *RHS);
 
+  /// Check whether the condition described by Pred, LHS, and RHS is true or
+  /// false. If we know it, return the evaluation of this condition. If neither
+  /// is proved, return None.
+  Optional<bool> evaluatePredicate(ICmpInst::Predicate Pred, const SCEV *LHS,
+                                   const SCEV *RHS);
+
   /// Test if the given expression is known to satisfy the condition described
   /// by Pred, LHS, and RHS in the given Context.
   bool isKnownPredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS,
                           const SCEV *RHS, const Instruction *Context);
 
+  /// Check whether the condition described by Pred, LHS, and RHS is true or
+  /// false in the given \p Context. If we know it, return the evaluation of
+  /// this condition. If neither is proved, return None.
+  Optional<bool> evaluatePredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS,
+                                     const SCEV *RHS,
+                                     const Instruction *Context);
+
   /// Test if the condition described by Pred, LHS, RHS is known to be true on
   /// every iteration of the loop of the recurrency LHS.
bool isKnownOnEveryIteration(ICmpInst::Predicate Pred,
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index ddb56562799e..ecf003319cd2 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -9534,6 +9534,16 @@ bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred,
   return isKnownViaNonRecursiveReasoning(Pred, LHS, RHS);
 }
 
+Optional<bool> ScalarEvolution::evaluatePredicate(ICmpInst::Predicate Pred,
+                                                  const SCEV *LHS,
+                                                  const SCEV *RHS) {
+  if (isKnownPredicate(Pred, LHS, RHS))
+    return true;
+  else if (isKnownPredicate(ICmpInst::getInversePredicate(Pred), LHS, RHS))
+    return false;
+  return None;
+}
+
 bool ScalarEvolution::isKnownPredicateAt(ICmpInst::Predicate Pred,
                                          const SCEV *LHS, const SCEV *RHS,
                                          const Instruction *Context) {
@@ -9542,6 +9552,23 @@ bool ScalarEvolution::isKnownPredicateAt(ICmpInst::Predicate Pred,
          isBasicBlockEntryGuardedByCond(Context->getParent(), Pred, LHS, RHS);
 }
 
+Optional<bool>
+ScalarEvolution::evaluatePredicateAt(ICmpInst::Predicate Pred, const SCEV *LHS,
+                                     const SCEV *RHS,
+                                     const Instruction *Context) {
+  Optional<bool> KnownWithoutContext = evaluatePredicate(Pred, LHS, RHS);
+  if (KnownWithoutContext)
+    return KnownWithoutContext;
+
+  if (isBasicBlockEntryGuardedByCond(Context->getParent(), Pred, LHS, RHS))
+    return true;
+  else if (isBasicBlockEntryGuardedByCond(Context->getParent(),
+                                          ICmpInst::getInversePredicate(Pred),
+                                          LHS, RHS))
+    return false;
+  return None;
+}
+
 bool ScalarEvolution::isKnownOnEveryIteration(ICmpInst::Predicate Pred,
                                               const SCEVAddRecExpr *LHS,
                                               const SCEV *RHS) {
--
GitLab

From 1067a13cc11fb7e02b337bc669426bcd5958e86b Mon Sep 17 00:00:00 2001
From: Max Kazantsev
Date: Thu, 18 Mar 2021 18:48:10 +0700
Subject: [PATCH 0029/1000] [NFC] Use evaluatePredicate in eliminateComparison

Just makes code simpler.
---
 llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 290c04a7ad10..d0c43bb26105 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -263,12 +263,8 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
 
   // If the condition is always true or always false, replace it with
   // a constant value.
- if (SE->isKnownPredicate(Pred, S, X)) { - ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext())); - DeadInsts.emplace_back(ICmp); - LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); - } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) { - ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext())); + if (auto Ev = SE->evaluatePredicate(Pred, S, X)) { + ICmp->replaceAllUsesWith(ConstantInt::getBool(ICmp->getContext(), *Ev)); DeadInsts.emplace_back(ICmp); LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); } else if (makeIVComparisonInvariant(ICmp, IVOperand)) { -- GitLab From 26ec76add5cf0689dc545ade9a39eef58db6e3d7 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Thu, 18 Mar 2021 18:50:55 +0700 Subject: [PATCH 0030/1000] [NFC] One more use case for evaluatePredicate --- llvm/lib/Transforms/Utils/LoopPeel.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index befacb591762..cd1f6f0c78a5 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -211,9 +211,7 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount, // Do not consider predicates that are known to be true or false // independently of the loop iteration. - if (SE.isKnownPredicate(Pred, LeftSCEV, RightSCEV) || - SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), LeftSCEV, - RightSCEV)) + if (SE.evaluatePredicate(Pred, LeftSCEV, RightSCEV)) continue; // Check if we have a condition with one AddRec and one non AddRec -- GitLab From 8e11bede3a6ac11ebcc05c82fac39899feaf9534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 11 Mar 2021 23:44:16 +0200 Subject: [PATCH 0031/1000] [compiler-rt] Produce the right arch suffix for arm libraries If producing libraries with an arch suffix (i.e. if LLVM_ENABLE_PER_TARGET_RUNTIME_DIR isn't set), we append the architecture name. However, for arm, clang doesn't look for libraries with the full architecture name, but only looks for "arm" and "armhf". Try to deduce what the full target triple might have been, and use that for deciding between "arm" and "armhf". This tries to reapply this bit from D98173, that had to be reverted in 7b153b43d3a14d76975039408c4b922beb576735 due to affecting how the builtins themselves are compiled, not only affecting the output file name. Differential Revision: https://reviews.llvm.org/D98452 --- compiler-rt/cmake/Modules/AddCompilerRT.cmake | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/compiler-rt/cmake/Modules/AddCompilerRT.cmake b/compiler-rt/cmake/Modules/AddCompilerRT.cmake index fe4c61abd403..ca2f34e618ab 100644 --- a/compiler-rt/cmake/Modules/AddCompilerRT.cmake +++ b/compiler-rt/cmake/Modules/AddCompilerRT.cmake @@ -124,6 +124,21 @@ macro(set_output_name output name arch) else() if(ANDROID AND ${arch} STREQUAL "i386") set(${output} "${name}-i686${COMPILER_RT_OS_SUFFIX}") + elseif("${arch}" MATCHES "^arm") + if(COMPILER_RT_DEFAULT_TARGET_ONLY) + set(triple "${COMPILER_RT_DEFAULT_TARGET_TRIPLE}") + else() + set(triple "${TARGET_TRIPLE}") + endif() + # When using arch-suffixed runtime library names, clang only looks for + # libraries named "arm" or "armhf", see getArchNameForCompilerRTLib in + # clang. Therefore, try to inspect both the arch name and the triple + # if it seems like we're building an armhf target. 
+ if ("${arch}" MATCHES "hf$" OR "${triple}" MATCHES "hf$") + set(${output} "${name}-armhf${COMPILER_RT_OS_SUFFIX}") + else() + set(${output} "${name}-arm${COMPILER_RT_OS_SUFFIX}") + endif() else() set(${output} "${name}-${arch}${COMPILER_RT_OS_SUFFIX}") endif() -- GitLab From eb4c85e4501e67f48539bed0e622996ec75d1bd1 Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Sun, 27 Dec 2020 16:07:20 +0300 Subject: [PATCH 0032/1000] [llvm-objcopy][NFC][Wasm] Do not use internal buffer while writing into the output. This patch is follow-up for D91028. It implements direct writing into the output stream for wasm. Depends on D91028 Differential Revision: https://reviews.llvm.org/D95478 --- llvm/tools/llvm-objcopy/wasm/Writer.cpp | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/llvm/tools/llvm-objcopy/wasm/Writer.cpp b/llvm/tools/llvm-objcopy/wasm/Writer.cpp index bce24b859573..2fad9e60c50f 100644 --- a/llvm/tools/llvm-objcopy/wasm/Writer.cpp +++ b/llvm/tools/llvm-objcopy/wasm/Writer.cpp @@ -56,29 +56,21 @@ size_t Writer::finalize() { Error Writer::write() { size_t TotalSize = finalize(); - std::unique_ptr Buf = - WritableMemoryBuffer::getNewMemBuffer(TotalSize); - if (!Buf) - return createStringError(errc::not_enough_memory, - "failed to allocate memory buffer of " + - Twine::utohexstr(TotalSize) + " bytes"); + Out.reserveExtraSpace(TotalSize); // Write the header. - uint8_t *Ptr = reinterpret_cast(Buf->getBufferStart()); - Ptr = std::copy(Obj.Header.Magic.begin(), Obj.Header.Magic.end(), Ptr); - support::endian::write32le(Ptr, Obj.Header.Version); - Ptr += sizeof(Obj.Header.Version); + Out.write(Obj.Header.Magic.data(), Obj.Header.Magic.size()); + uint32_t Version; + support::endian::write32le(&Version, Obj.Header.Version); + Out.write(reinterpret_cast(&Version), sizeof(Version)); // Write each section. for (size_t I = 0, S = SectionHeaders.size(); I < S; ++I) { - Ptr = std::copy(SectionHeaders[I].begin(), SectionHeaders[I].end(), Ptr); - ArrayRef Contents = Obj.Sections[I].Contents; - Ptr = std::copy(Contents.begin(), Contents.end(), Ptr); + Out.write(SectionHeaders[I].data(), SectionHeaders[I].size()); + Out.write(reinterpret_cast(Obj.Sections[I].Contents.data()), + Obj.Sections[I].Contents.size()); } - // TODO: Implement direct writing to the output stream (without intermediate - // memory buffer Buf). - Out.write(Buf->getBufferStart(), Buf->getBufferSize()); return Error::success(); } -- GitLab From b3ced9852c7e6cc2dab61b6adb5c92812c99b00e Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 12 Mar 2021 07:39:53 -0800 Subject: [PATCH 0033/1000] [SLP]Fix crash on extending scheduling region. If SLP vectorizer tries to extend the scheduling region and runs out of the budget too early, but still extends the region to the new ending instructions (i.e., it was able to extend the region for the first instruction in the bundle, but not for the second), the compiler need to recalculate dependecies in full, just like if the extending was successfull. Without it, the schedule data chunks may end up with the wrong number of (unscheduled) dependecies and it may end up with the incorrect function, where the vectorized instruction does not dominate on the extractelement instruction. 
Differential Revision: https://reviews.llvm.org/D98531 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 80 +++++++------- .../X86/crash_exceed_scheduling.ll | 100 ++++++++++++++++++ 2 files changed, 144 insertions(+), 36 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7c10abe09fdf..0ec802799c22 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5179,11 +5179,53 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + auto &&TryScheduleBundle = [this, OldScheduleEnd, SLP](bool ReSchedule, + ScheduleData *Bundle) { + // The scheduling region got new instructions at the lower end (or it is a + // new region for the first bundle). This makes it necessary to + // recalculate all dependencies. + // It is seldom that this needs to be done a second time after adding the + // initial bundle to the region. + if (ScheduleEnd != OldScheduleEnd) { + for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) + doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); }); + ReSchedule = true; + } + if (ReSchedule) { + resetSchedule(); + initialFillReadyList(ReadyInsts); + } + if (Bundle) { + LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle + << " in block " << BB->getName() << "\n"); + calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP); + } + + // Now try to schedule the new bundle or (if no bundle) just calculate + // dependencies. As soon as the bundle is "ready" it means that there are no + // cyclic dependencies and we can schedule it. Note that's important that we + // don't "schedule" the bundle yet (see cancelScheduling). + while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) && + !ReadyInsts.empty()) { + ScheduleData *Picked = ReadyInsts.pop_back_val(); + if (Picked->isSchedulingEntity() && Picked->isReady()) + schedule(Picked, ReadyInsts); + } + }; + // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (!extendSchedulingRegion(V, S)) + if (!extendSchedulingRegion(V, S)) { + // If the scheduling region got new instructions at the lower end (or it + // is a new region for the first bundle). This makes it necessary to + // recalculate all dependencies. + // Otherwise the compiler may crash trying to incorrectly calculate + // dependencies and emit instruction in the wrong order at the actual + // scheduling. + TryScheduleBundle(/*ReSchedule=*/false, nullptr); return None; + } } for (Value *V : VL) { @@ -5212,42 +5254,8 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, BundleMember->FirstInBundle = Bundle; PrevInBundle = BundleMember; } - if (ScheduleEnd != OldScheduleEnd) { - // The scheduling region got new instructions at the lower end (or it is a - // new region for the first bundle). This makes it necessary to - // recalculate all dependencies. - // It is seldom that this needs to be done a second time after adding the - // initial bundle to the region. 
- for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [](ScheduleData *SD) { - SD->clearDependencies(); - }); - } - ReSchedule = true; - } - if (ReSchedule) { - resetSchedule(); - initialFillReadyList(ReadyInsts); - } assert(Bundle && "Failed to find schedule bundle"); - - LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block " - << BB->getName() << "\n"); - - calculateDependencies(Bundle, true, SLP); - - // Now try to schedule the new bundle. As soon as the bundle is "ready" it - // means that there are no cyclic dependencies and we can schedule it. - // Note that's important that we don't "schedule" the bundle yet (see - // cancelScheduling). - while (!Bundle->isReady() && !ReadyInsts.empty()) { - - ScheduleData *pickedSD = ReadyInsts.pop_back_val(); - - if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) { - schedule(pickedSD, ReadyInsts); - } - } + TryScheduleBundle(ReSchedule, Bundle); if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); return None; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll new file mode 100644 index 000000000000..299c2d3642c4 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -slp-min-tree-size=2 -slp-threshold=-1000 -slp-max-look-ahead-depth=1 -slp-look-ahead-users-budget=1 -slp-schedule-budget=27 -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define void @exceed(double %0, double %1) { +; CHECK-LABEL: @exceed( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef +; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX13:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX14:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX15:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] 
+; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double undef, i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> , double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = fmul fast <2 x double> [[TMP14]], [[TMP15]] +; CHECK-NEXT: switch i32 undef, label [[BB1:%.*]] [ +; CHECK-NEXT: i32 0, label [[BB2:%.*]] +; CHECK-NEXT: ] +; CHECK: bb1: +; CHECK-NEXT: br label [[LABEL:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x double> [[TMP16]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x double> poison, double [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP18]], double [[TMP19]], i32 1 +; CHECK-NEXT: br label [[LABEL]] +; CHECK: label: +; CHECK-NEXT: [[TMP21:%.*]] = phi <2 x double> [ [[TMP12]], [[BB1]] ], [ [[TMP20]], [[BB2]] ] +; CHECK-NEXT: ret void +; +entry: + %i10 = fdiv fast double %0, %1 + %ix = fmul double %i10, undef + %ixx0 = fsub double undef, undef + %ixx1 = fsub double undef, undef + %ixx2 = fsub double undef, undef + %ixx3 = fsub double undef, undef + %ixx4 = fsub double undef, undef + %ixx5 = fsub double undef, undef + %ix1 = fmul double %i10, undef + %ixx10 = fsub double undef, undef + %ixx11 = fsub double undef, undef + %ixx12 = fsub double undef, undef + %ixx13 = fsub double undef, undef + %ixx14 = fsub double undef, undef + %ixx15 = fsub double undef, undef + %ixx20 = fsub double undef, undef + %ixx21 = fsub double undef, undef + %ixx22 = fsub double undef, undef + %i11 = fdiv fast double %0, %1 + %ix2 = fmul double %i11, %i11 + %tmp1 = fadd fast double %i11, %0 + %tmp2 = fadd fast double %0, %1 + %tmp5 = fmul fast double %tmp1, %tmp2 + %tmp15 = fadd fast double %i10, %1 + %tmp25 = fadd fast double %0, %1 + %tmp6 = fmul fast double %tmp15, %tmp25 + %tmp555 = fmul fast double %i10, undef + %ixx101 = fsub double undef, undef + %tmp666 = fmul fast double %1, undef + switch i32 undef, label %bb1 [ + i32 0, label %bb2 + ] + +bb1: ; preds = %entry + br label %label + +bb2: ; preds = %entry + br label %label + +label: ; preds = %bb2, %bb1 + %phi1 = phi double [ %tmp5, %bb1 ], [ %tmp555, %bb2 ] + %phi2 = phi double [ %tmp6, %bb1 ], [ %tmp666, %bb2 ] + ret void +} -- GitLab From 61f834cc0937c4532e5679f95b2a44d529a4d8bf Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 12 Mar 2021 11:06:18 -0500 Subject: [PATCH 0034/1000] GlobalISel: Insert memcpy for outgoing byval arguments byval requires an implicit copy between the caller and callee such that the callee may write into the stack area without it modifying the value in the parent. Previously, this was passing through the raw pointer value which would break if the callee wrote into it. Most of the time, this copy can be optimized out (however we don't have the optimization SelectionDAG does yet). This will trigger more fallbacks for AMDGPU now, since we don't have legalization for memcpy yet (although we should stop using byval anyway). 
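
For reference, byval mirrors C++'s pass-by-value semantics for stack
aggregates. A minimal standalone sketch (hypothetical names, not code
from this patch) of the behavior the implicit copy guarantees:

    #include <cassert>

    struct S { int a[64]; };   // large aggregate, lowered as a byval pointer

    void callee(S s) {         // callee owns a private stack copy of s
      s.a[0] = 42;             // must not be visible in the caller
    }

    void caller() {
      S s{};                   // zero-initialized
      callee(s);               // correct lowering memcpys s into the callee's
                               // stack area; passing the raw pointer instead
                               // would let the store above clobber s
      assert(s.a[0] == 0);
    }

    int main() { caller(); }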
---
 .../llvm/CodeGen/GlobalISel/CallLowering.h    |  8 ++
 .../CodeGen/GlobalISel/MachineIRBuilder.h     | 21 ++++++
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp  | 67 ++++++++++++++---
 .../CodeGen/AArch64/GlobalISel/byval-call.ll  | 75 +++++++++++++++++++
 .../AMDGPU/GlobalISel/irtranslator-call.ll    | 65 +++++++++++++++-
 5 files changed, 225 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
index 5b296086ef2a..f63033cf6136 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -187,6 +187,14 @@ public:
       llvm_unreachable("Custom values not supported");
     }
 
+    /// Do a memory copy of \p MemSize bytes from \p SrcPtr to \p DstPtr. This
+    /// is necessary for outgoing stack-passed byval arguments.
+    void
+    copyArgumentMemory(const ArgInfo &Arg, Register DstPtr, Register SrcPtr,
+                       const MachinePointerInfo &DstPtrInfo, Align DstAlign,
+                       const MachinePointerInfo &SrcPtrInfo, Align SrcAlign,
+                       uint64_t MemSize, CCValAssign &VA) const;
+
     /// Extend a register to the location type given in VA, capped at extending
     /// to at most MaxSize bits. If MaxSizeBits is 0 then no maximum is set.
     Register extendRegister(Register ValReg, CCValAssign &VA,
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 6c64cd5cb208..c916ff14aa14 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1810,6 +1810,27 @@ public:
   MachineInstrBuilder buildVecReduceUMin(const DstOp &Dst, const SrcOp &Src) {
     return buildInstr(TargetOpcode::G_VECREDUCE_UMIN, {Dst}, {Src});
   }
+
+  /// Build and insert G_MEMCPY or G_MEMMOVE
+  MachineInstrBuilder buildMemTransferInst(unsigned Opcode, const SrcOp &DstPtr,
+                                           const SrcOp &SrcPtr,
+                                           const SrcOp &Size,
+                                           MachineMemOperand &DstMMO,
+                                           MachineMemOperand &SrcMMO) {
+    auto MIB = buildInstr(
+        Opcode, {}, {DstPtr, SrcPtr, Size, SrcOp(INT64_C(0) /*isTailCall*/)});
+    MIB.addMemOperand(&DstMMO);
+    MIB.addMemOperand(&SrcMMO);
+    return MIB;
+  }
+
+  MachineInstrBuilder buildMemCpy(const SrcOp &DstPtr, const SrcOp &SrcPtr,
+                                  const SrcOp &Size, MachineMemOperand &DstMMO,
+                                  MachineMemOperand &SrcMMO) {
+    return buildMemTransferInst(TargetOpcode::G_MEMCPY, DstPtr, SrcPtr, Size,
+                                DstMMO, SrcMMO);
+  }
+
   virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef<DstOp> DstOps,
                                          ArrayRef<SrcOp> SrcOps,
                                          Optional<unsigned> Flags = None);
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index f689801fa30f..601d087e0453 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -647,17 +647,43 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
     }
 
     if (VA.isMemLoc() && Flags.isByVal()) {
-      // FIXME: We should be inserting a memcpy from the source pointer to the
-      // result for outgoing byval parameters.
-      if (!Handler.isIncomingArgumentHandler())
-        continue;
-
-      MachinePointerInfo MPO;
-      Register StackAddr = Handler.getStackAddress(
-          Flags.getByValSize(), VA.getLocMemOffset(), MPO, Flags);
       assert(Args[i].Regs.size() == 1 && "didn't expect split byval pointer");
-      MIRBuilder.buildCopy(Args[i].Regs[0], StackAddr);
+
+      if (Handler.isIncomingArgumentHandler()) {
+        // We just need to copy the frame index value to the pointer.
+ MachinePointerInfo MPO; + Register StackAddr = Handler.getStackAddress( + Flags.getByValSize(), VA.getLocMemOffset(), MPO, Flags); + MIRBuilder.buildCopy(Args[i].Regs[0], StackAddr); + } else { + // For outgoing byval arguments, insert the implicit copy byval + // implies, such that writes in the callee do not modify the caller's + // value. + uint64_t MemSize = Flags.getByValSize(); + int64_t Offset = VA.getLocMemOffset(); + + MachinePointerInfo DstMPO; + Register StackAddr = + Handler.getStackAddress(MemSize, Offset, DstMPO, Flags); + + const LLT PtrTy = MRI.getType(StackAddr); + + // FIXME: We do not have access to the original IR value here to + // preserve the aliasing information. + MachinePointerInfo SrcMPO(PtrTy.getAddressSpace()); + + Align DstAlign = std::max(Flags.getNonZeroByValAlign(), + inferAlignFromPtrInfo(MF, DstMPO)); + + // TODO: Theoretically the source value could have a higher alignment, + // but we don't have that here + Align SrcAlign = Flags.getNonZeroByValAlign(); + + Handler.copyArgumentMemory(Args[i], StackAddr, Args[i].Regs[0], + DstMPO, DstAlign, SrcMPO, SrcAlign, + MemSize, VA); + } continue; } @@ -963,6 +989,29 @@ bool CallLowering::resultsCompatible(CallLoweringInfo &Info, return true; } +void CallLowering::ValueHandler::copyArgumentMemory( + const ArgInfo &Arg, Register DstPtr, Register SrcPtr, + const MachinePointerInfo &DstPtrInfo, Align DstAlign, + const MachinePointerInfo &SrcPtrInfo, Align SrcAlign, uint64_t MemSize, + CCValAssign &VA) const { + MachineFunction &MF = MIRBuilder.getMF(); + MachineMemOperand *SrcMMO = MF.getMachineMemOperand( + SrcPtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable, MemSize, + SrcAlign); + + MachineMemOperand *DstMMO = MF.getMachineMemOperand( + DstPtrInfo, + MachineMemOperand::MOStore | MachineMemOperand::MODereferenceable, + MemSize, DstAlign); + + const LLT PtrTy = MRI.getType(DstPtr); + const LLT SizeTy = LLT::scalar(PtrTy.getSizeInBits()); + + auto SizeConst = MIRBuilder.buildConstant(SizeTy, MemSize); + MIRBuilder.buildMemCpy(DstPtr, SrcPtr, SizeConst, *DstMMO, *SrcMMO); +} + Register CallLowering::ValueHandler::extendRegister(Register ValReg, CCValAssign &VA, unsigned MaxSizeBits) { diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll b/llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll new file mode 100644 index 000000000000..778c823552a1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +declare void @byval_i32(i32* byval(i32) %ptr) + +define void @call_byval_i32(i32* %incoming) { +; CHECK-LABEL: call_byval_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #32 // =32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: str w8, [sp] +; CHECK-NEXT: bl byval_i32 +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 // =32 +; CHECK-NEXT: ret + call void @byval_i32(i32* byval(i32) %incoming) + ret void +} + +declare void @byval_a64i32([64 x i32]* byval([64 x i32]) %ptr) + +define void @call_byval_a64i32([64 x i32]* %incoming) { +; CHECK-LABEL: call_byval_a64i32: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #288 // =288 +; CHECK-NEXT: stp x29, x30, [sp, #256] // 16-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #272] // 8-byte 
Folded Spill +; CHECK-NEXT: add x29, sp, #256 // =256 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: ldr q0, [x0, #16] +; CHECK-NEXT: str q0, [sp, #16] +; CHECK-NEXT: ldr q0, [x0, #32] +; CHECK-NEXT: str q0, [sp, #32] +; CHECK-NEXT: ldr q0, [x0, #48] +; CHECK-NEXT: str q0, [sp, #48] +; CHECK-NEXT: ldr q0, [x0, #64] +; CHECK-NEXT: str q0, [sp, #64] +; CHECK-NEXT: ldr q0, [x0, #80] +; CHECK-NEXT: str q0, [sp, #80] +; CHECK-NEXT: ldr q0, [x0, #96] +; CHECK-NEXT: str q0, [sp, #96] +; CHECK-NEXT: ldr q0, [x0, #112] +; CHECK-NEXT: str q0, [sp, #112] +; CHECK-NEXT: ldr q0, [x0, #128] +; CHECK-NEXT: str q0, [sp, #128] +; CHECK-NEXT: ldr q0, [x0, #144] +; CHECK-NEXT: str q0, [sp, #144] +; CHECK-NEXT: ldr q0, [x0, #160] +; CHECK-NEXT: str q0, [sp, #160] +; CHECK-NEXT: ldr q0, [x0, #176] +; CHECK-NEXT: str q0, [sp, #176] +; CHECK-NEXT: ldr q0, [x0, #192] +; CHECK-NEXT: str q0, [sp, #192] +; CHECK-NEXT: ldr q0, [x0, #208] +; CHECK-NEXT: str q0, [sp, #208] +; CHECK-NEXT: ldr q0, [x0, #224] +; CHECK-NEXT: str q0, [sp, #224] +; CHECK-NEXT: ldr q0, [x0, #240] +; CHECK-NEXT: str q0, [sp, #240] +; CHECK-NEXT: bl byval_a64i32 +; CHECK-NEXT: ldr x28, [sp, #272] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #256] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #288 // =288 +; CHECK-NEXT: ret + call void @byval_a64i32([64 x i32]* byval([64 x i32]) %incoming) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 79a346240c17..bf632f035572 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -3912,8 +3912,13 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 20 ; CHECK: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[COPY19]], [[C5]](s32) ; CHECK: [[OR1:%[0-9]+]]:_(s32) = G_OR [[OR]], [[SHL1]] - ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg - ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK: [[COPY20:%[0-9]+]]:_(p5) = COPY $sp_reg + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C6]](s32) + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store 8 into stack, align 4, addrspace 5), (dereferenceable load 8, align 4, addrspace 5) + ; CHECK: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) ; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4) ; CHECK: $sgpr6_sgpr7 = COPY [[COPY11]](p4) ; CHECK: $sgpr8_sgpr9 = COPY [[PTR_ADD1]](p4) @@ -3934,6 +3939,62 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 ret void } +declare void @void_func_byval_a3i32_byval_i8_align32([3 x i32] addrspace(5)* byval([3 x i32]) %arg0, i8 addrspace(5)* byval(i8) align 32 %arg1, i32 %arg2) #0 + +define void @call_byval_3ai32_byval_i8_align32([3 x i32] addrspace(5)* %incoming0, i8 addrspace(5)* align 32 %incoming1) #0 { + ; CHECK-LABEL: name: call_byval_3ai32_byval_i8_align32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr1, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, 
$sgpr10_sgpr11, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK: [[COPY8:%[0-9]+]]:_(p5) = COPY $vgpr0 + ; CHECK: [[COPY9:%[0-9]+]]:_(p5) = COPY $vgpr1 + ; CHECK: [[COPY10:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 999 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_func_byval_a3i32_byval_i8_align32 + ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; CHECK: [[COPY13:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; CHECK: [[COPY14:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY19:%[0-9]+]]:_(p5) = COPY $sgpr32 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; CHECK: G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C2]](s32), 0 :: (dereferenceable store 12 into stack, align 4, addrspace 5), (dereferenceable load 12, align 4, addrspace 5) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: G_MEMCPY [[PTR_ADD1]](p5), [[COPY9]](p5), [[C4]](s32), 0 :: (dereferenceable store 1 into stack + 32, align 32, addrspace 5), (dereferenceable load 1, align 32, addrspace 5) + ; CHECK: $vgpr0 = COPY [[C]](s32) + ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) + ; CHECK: $sgpr4_sgpr5 = COPY [[COPY11]](p4) + ; CHECK: $sgpr6_sgpr7 = COPY [[COPY12]](p4) + ; CHECK: $sgpr8_sgpr9 = COPY [[COPY13]](p4) + ; CHECK: $sgpr10_sgpr11 = COPY [[COPY14]](s64) + ; CHECK: $sgpr12 = COPY [[COPY15]](s32) + ; CHECK: $sgpr13 = COPY [[COPY16]](s32) + ; CHECK: $sgpr14 = COPY [[COPY17]](s32) + ; CHECK: $vgpr31 = COPY [[COPY18]](s32) + ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @void_func_byval_a3i32_byval_i8_align32, csr_amdgpu_highregs, implicit $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31 + ; CHECK: ADJCALLSTACKDOWN 0, 36, implicit-def $scc + ; CHECK: [[COPY21:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY10]] + ; CHECK: S_SETPC_B64_return [[COPY21]] + call void @void_func_byval_a3i32_byval_i8_align32([3 x i32] addrspace(5)* byval([3 x i32]) %incoming0, i8 addrspace(5)* align 32 %incoming1, i32 999) + ret void +} + define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i8 ; CHECK: bb.1 (%ir-block.0): -- GitLab From b9a03849836f6409291025a31089bfabfa96dd0b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sun, 14 Mar 2021 10:26:31 -0400 Subject: [PATCH 0035/1000] GlobalISel: Preserve 
source value information for outgoing byval args

Pass through the original argument IR value in order to preserve the
aliasing information in the memcpy memory operands.
---
 .../llvm/CodeGen/GlobalISel/CallLowering.h    | 18 +++++-
 llvm/lib/CodeGen/GlobalISel/CallLowering.cpp  | 22 +++----
 llvm/lib/CodeGen/GlobalISel/Utils.cpp         |  5 ++
 .../AArch64/GISel/AArch64CallLowering.cpp     |  2 +-
 llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp |  2 +-
 .../AMDGPU/GlobalISel/irtranslator-call.ll    | 57 ++++++++++++++++++-
 6 files changed, 88 insertions(+), 18 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
index f63033cf6136..868980d24fc2 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -23,6 +23,7 @@
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MachineValueType.h"
 #include <cstdint>
@@ -38,7 +39,6 @@ class MachineIRBuilder;
 struct MachinePointerInfo;
 class MachineRegisterInfo;
 class TargetLowering;
-class Value;
 
 class CallLowering {
   const TargetLowering *TLI;
@@ -65,10 +65,17 @@ public:
     // if the argument was an incoming arg.
    SmallVector<Register, 2> OrigRegs;
 
+    /// Optionally track the original IR value for the argument. This may not
+    /// be meaningful in all contexts. This should only be used for forwarding
+    /// through to use for aliasing information in MachinePointerInfo for
+    /// memory arguments.
+    const Value *OrigValue = nullptr;
+
     ArgInfo(ArrayRef<Register> Regs, Type *Ty,
             ArrayRef<ISD::ArgFlagsTy> Flags = ArrayRef<ISD::ArgFlagsTy>(),
-            bool IsFixed = true)
-        : BaseArgInfo(Ty, Flags, IsFixed), Regs(Regs.begin(), Regs.end()) {
+            bool IsFixed = true, const Value *OrigValue = nullptr)
+        : BaseArgInfo(Ty, Flags, IsFixed), Regs(Regs.begin(), Regs.end()),
+          OrigValue(OrigValue) {
       if (!Regs.empty() && Flags.empty())
         this->Flags.push_back(ISD::ArgFlagsTy());
       // FIXME: We should have just one way of saying "no register".
@@ -77,6 +84,11 @@
              "only void types should have no register");
     }
 
+    ArgInfo(ArrayRef<Register> Regs, const Value &OrigValue,
+            ArrayRef<ISD::ArgFlagsTy> Flags = ArrayRef<ISD::ArgFlagsTy>(),
+            bool IsFixed = true)
+        : ArgInfo(Regs, OrigValue.getType(), Flags, IsFixed, &OrigValue) {}
+
     ArgInfo() : BaseArgInfo() {}
   };
 
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 601d087e0453..808be0ff6381 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -112,7 +112,7 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
   unsigned i = 0;
   unsigned NumFixedArgs = CB.getFunctionType()->getNumParams();
   for (auto &Arg : CB.args()) {
-    ArgInfo OrigArg{ArgRegs[i], Arg->getType(), getAttributesForArgIdx(CB, i),
+    ArgInfo OrigArg{ArgRegs[i], *Arg.get(), getAttributesForArgIdx(CB, i),
                     i < NumFixedArgs};
     setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB);
 
@@ -204,7 +204,8 @@ void CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
     // No splitting to do, but we want to replace the original type (e.g. [1 x
    // double] -> double).
    SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
-                           OrigArg.Flags[0], OrigArg.IsFixed);
+                           OrigArg.Flags[0], OrigArg.IsFixed,
+                           OrigArg.OrigValue);
     return;
   }
 
@@ -667,18 +668,19 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
         Register StackAddr =
             Handler.getStackAddress(MemSize, Offset, DstMPO, Flags);
 
-        const LLT PtrTy = MRI.getType(StackAddr);
-
-        // FIXME: We do not have access to the original IR value here to
-        // preserve the aliasing information.
-        MachinePointerInfo SrcMPO(PtrTy.getAddressSpace());
+        MachinePointerInfo SrcMPO(Args[i].OrigValue);
+        if (!Args[i].OrigValue) {
+          // We still need to accurately track the stack address space if we
+          // don't know the underlying value.
+          const LLT PtrTy = MRI.getType(StackAddr);
+          SrcMPO = MachinePointerInfo(PtrTy.getAddressSpace());
+        }
 
         Align DstAlign = std::max(Flags.getNonZeroByValAlign(),
                                   inferAlignFromPtrInfo(MF, DstMPO));
 
-        // TODO: Theoretically the source value could have a higher alignment,
-        // but we don't have that here
-        Align SrcAlign = Flags.getNonZeroByValAlign();
+        Align SrcAlign = std::max(Flags.getNonZeroByValAlign(),
+                                  inferAlignFromPtrInfo(MF, SrcMPO));
 
         Handler.copyArgumentMemory(Args[i], StackAddr, Args[i].Regs[0],
                                    DstMPO, DstAlign, SrcMPO, SrcAlign,
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index 5d062820a49f..067018ba2cff 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -552,6 +552,11 @@ Align llvm::inferAlignFromPtrInfo(MachineFunction &MF,
                                    MPO.Offset);
   }
 
+  if (const Value *V = MPO.V.dyn_cast<const Value *>()) {
+    const Module *M = MF.getFunction().getParent();
+    return V->getPointerAlignment(M->getDataLayout());
+  }
+
   return Align(1);
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index b97e63f51d1e..ef0d4c6ee93c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -462,7 +462,7 @@ bool AArch64CallLowering::lowerFormalArguments(
     if (DL.getTypeStoreSize(Arg.getType()).isZero())
       continue;
 
-    ArgInfo OrigArg{VRegs[i], Arg.getType()};
+    ArgInfo OrigArg{VRegs[i], Arg};
     setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, F);
 
     splitToValueTypes(OrigArg, SplitArgs, DL, F.getCallingConv());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index c7c4ed45589f..a942a740535a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -656,7 +656,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
       }
     }
 
-    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+    ArgInfo OrigArg(VRegs[Idx], Arg);
     const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
     setArgFlags(OrigArg, OrigArgIdx, DL, F);
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
index bf632f035572..c0807b83f841 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll
@@ -3916,7 +3916,7 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
   ; CHECK:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; CHECK:   [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY20]], [[C6]](s32)
   ; CHECK:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
-  ; CHECK:   G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store 8 into
stack, align 4, addrspace 5), (dereferenceable load 8, align 4, addrspace 5) + ; CHECK: G_MEMCPY [[PTR_ADD2]](p5), [[FRAME_INDEX]](p5), [[C7]](s32), 0 :: (dereferenceable store 8 into stack, align 4, addrspace 5), (dereferenceable load 8 from %ir.val, align 4, addrspace 5) ; CHECK: [[COPY21:%[0-9]+]]:_(<4 x s32>) = COPY $private_rsrc_reg ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY21]](<4 x s32>) ; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4) @@ -3971,11 +3971,11 @@ define void @call_byval_3ai32_byval_i8_align32([3 x i32] addrspace(5)* %incoming ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C1]](s32) ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; CHECK: G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C2]](s32), 0 :: (dereferenceable store 12 into stack, align 4, addrspace 5), (dereferenceable load 12, align 4, addrspace 5) + ; CHECK: G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C2]](s32), 0 :: (dereferenceable store 12 into stack, align 4, addrspace 5), (dereferenceable load 12 from %ir.incoming0, align 4, addrspace 5) ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY19]], [[C3]](s32) ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; CHECK: G_MEMCPY [[PTR_ADD1]](p5), [[COPY9]](p5), [[C4]](s32), 0 :: (dereferenceable store 1 into stack + 32, align 32, addrspace 5), (dereferenceable load 1, align 32, addrspace 5) + ; CHECK: G_MEMCPY [[PTR_ADD1]](p5), [[COPY9]](p5), [[C4]](s32), 0 :: (dereferenceable store 1 into stack + 32, align 32, addrspace 5), (dereferenceable load 1 from %ir.incoming1, align 32, addrspace 5) ; CHECK: $vgpr0 = COPY [[C]](s32) ; CHECK: [[COPY20:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY20]](<4 x s32>) @@ -3995,6 +3995,57 @@ define void @call_byval_3ai32_byval_i8_align32([3 x i32] addrspace(5)* %incoming ret void } +declare void @void_func_byval_a4i64_align4([4 x i64] addrspace(5)* byval([4 x i64]) align 4 %arg0) #0 + +; Make sure we are aware of the higher alignment of the incoming value +; than implied by the outgoing byval alignment in the memory operand. 
+define void @call_byval_a4i64_align4_higher_source_align([4 x i64] addrspace(5)* align 256 %incoming_high_align) #0 { + ; CHECK-LABEL: name: call_byval_a4i64_align4_higher_source_align + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr0, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr14 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr13 + ; CHECK: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr12 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 + ; CHECK: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 + ; CHECK: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 + ; CHECK: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 + ; CHECK: [[COPY8:%[0-9]+]]:_(p5) = COPY $vgpr0 + ; CHECK: [[COPY9:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc + ; CHECK: [[GV:%[0-9]+]]:sreg_64(p0) = G_GLOBAL_VALUE @void_func_byval_a4i64_align4 + ; CHECK: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY7]] + ; CHECK: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY6]] + ; CHECK: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY5]] + ; CHECK: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY4]] + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY3]] + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY2]] + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY1]] + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY18:%[0-9]+]]:_(p5) = COPY $sgpr32 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY18]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK: G_MEMCPY [[PTR_ADD]](p5), [[COPY8]](p5), [[C1]](s32), 0 :: (dereferenceable store 32 into stack, align 4, addrspace 5), (dereferenceable load 32 from %ir.incoming_high_align, align 256, addrspace 5) + ; CHECK: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) + ; CHECK: $sgpr4_sgpr5 = COPY [[COPY10]](p4) + ; CHECK: $sgpr6_sgpr7 = COPY [[COPY11]](p4) + ; CHECK: $sgpr8_sgpr9 = COPY [[COPY12]](p4) + ; CHECK: $sgpr10_sgpr11 = COPY [[COPY13]](s64) + ; CHECK: $sgpr12 = COPY [[COPY14]](s32) + ; CHECK: $sgpr13 = COPY [[COPY15]](s32) + ; CHECK: $sgpr14 = COPY [[COPY16]](s32) + ; CHECK: $vgpr31 = COPY [[COPY17]](s32) + ; CHECK: $sgpr30_sgpr31 = SI_CALL [[GV]](p0), @void_func_byval_a4i64_align4, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $vgpr31 + ; CHECK: ADJCALLSTACKDOWN 0, 32, implicit-def $scc + ; CHECK: [[COPY20:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY9]] + ; CHECK: S_SETPC_B64_return [[COPY20]] + call void @void_func_byval_a4i64_align4([4 x i64] addrspace(5)* byval([4 x i64]) align 4 %incoming_high_align) + ret void +} + define amdgpu_kernel void @test_call_external_void_func_v2i8() #0 { ; CHECK-LABEL: name: test_call_external_void_func_v2i8 ; CHECK: bb.1 (%ir-block.0): -- GitLab From 758efce346c802953ef17ecd7b9a571e53457f13 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Mar 2021 13:31:20 +0000 Subject: [PATCH 0036/1000] [X86][SSE] Regenerate PR18054 test case --- llvm/test/CodeGen/X86/pr18054.ll | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/X86/pr18054.ll b/llvm/test/CodeGen/X86/pr18054.ll index 
b7af51618047..0c5079a93170 100644 --- a/llvm/test/CodeGen/X86/pr18054.ll +++ b/llvm/test/CodeGen/X86/pr18054.ll @@ -1,10 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=penryn | FileCheck %s define void @foo(<16 x i32>* %p, <16 x i1> %x) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3] +; CHECK-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; CHECK-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; CHECK-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: pslld $31, %xmm0 +; CHECK-NEXT: psrad $31, %xmm0 +; CHECK-NEXT: pslld $31, %xmm3 +; CHECK-NEXT: psrad $31, %xmm3 +; CHECK-NEXT: pslld $31, %xmm2 +; CHECK-NEXT: psrad $31, %xmm2 +; CHECK-NEXT: pslld $31, %xmm1 +; CHECK-NEXT: psrad $31, %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%rdi) +; CHECK-NEXT: movdqa %xmm2, 48(%rdi) +; CHECK-NEXT: movdqa %xmm3, 32(%rdi) +; CHECK-NEXT: movdqa %xmm0, 16(%rdi) +; CHECK-NEXT: retq %ret = sext <16 x i1> %x to <16 x i32> store <16 x i32> %ret, <16 x i32>* %p ret void -; CHECK: foo -; CHECK-NOT: pmovsxbd -; CHECK: ret } -- GitLab From de155f4af2b5f0916b8f2d745e6da520bb7e1058 Mon Sep 17 00:00:00 2001 From: David Truby Date: Wed, 17 Mar 2021 08:55:42 +0000 Subject: [PATCH 0037/1000] [MLIR][OpenMP] Pretty printer and parser for omp.wsloop Co-authored-by: Kiran Chandramohan Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D92327 --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 36 +- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 446 ++++++++++++++++-- .../OpenMPToLLVM/convert-to-llvmir.mlir | 4 +- .../Conversion/SCFToOpenMP/scf-to-openmp.mlir | 18 +- mlir/test/Dialect/OpenMP/ops.mlir | 209 ++++++-- 5 files changed, 617 insertions(+), 96 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 24a06c4d0d00..6c1f5c0e7f10 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -116,7 +116,8 @@ def TerminatorOp : OpenMP_Op<"terminator", [Terminator]> { // 2.9.2 Workshare Loop Construct //===----------------------------------------------------------------------===// -def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments]> { +def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments, + AllTypesMatch<["lowerBound", "upperBound", "step"]>]> { let summary = "workshare loop construct"; let description = [{ The workshare loop construct specifies that the iterations of the loop(s) @@ -130,13 +131,13 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments]> { by "omp.yield" instruction without operands. 
 ```
-    omp.wsloop (%i1, %i2) = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) {
-      %a = load %arrA[%i1, %i2] : memref<?x?xf32>
-      %b = load %arrB[%i1, %i2] : memref<?x?xf32>
-      %sum = addf %a, %b : f32
-      store %sum, %arrC[%i1, %i2] : memref<?x?xf32>
-      omp.yield
-    }
+    omp.wsloop (%i1, %i2) : index = (%c0, %c0) to (%c10, %c10) step (%c1, %c1) {
+      %a = load %arrA[%i1, %i2] : memref<?x?xf32>
+      %b = load %arrB[%i1, %i2] : memref<?x?xf32>
+      %sum = addf %a, %b : f32
+      store %sum, %arrC[%i1, %i2] : memref<?x?xf32>
+      omp.yield
+    }
 ```
 
 `private_vars`, `firstprivate_vars`, `lastprivate_vars` and `linear_vars`
@@ -181,10 +182,23 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments]> {
                OptionalAttr<StrAttr>:$order_val,
                UnitAttr:$inclusive);
 
+  let skipDefaultBuilders = 1;
+
   let builders = [
     OpBuilder<(ins "ValueRange":$lowerBound, "ValueRange":$upperBound,
-               "ValueRange":$step,
-               CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>
+               "ValueRange":$step,
+               CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>,
+    OpBuilder<(ins "TypeRange":$resultTypes, "ValueRange":$lowerBound,
+               "ValueRange":$upperBound, "ValueRange":$step,
+               "ValueRange":$privateVars, "ValueRange":$firstprivateVars,
+               "ValueRange":$lastprivate_vars, "ValueRange":$linear_vars,
+               "ValueRange":$linear_step_vars, "StringAttr":$schedule_val,
+               "Value":$schedule_chunk_var, "IntegerAttr":$collapse_val,
+               "UnitAttr":$nowait, "IntegerAttr":$ordered_val,
+               "StringAttr":$order_val, "UnitAttr":$inclusive, CArg<"bool",
+               "true">:$buildBody)>,
+    OpBuilder<(ins "TypeRange":$resultTypes, "ValueRange":$operands,
+               CArg<"ArrayRef<NamedAttribute>", "{}">:$attributes)>
   ];
 
   let regions = (region AnyRegion:$region);
@@ -193,6 +207,8 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
     /// Returns the number of loops in the workshape loop nest.
     unsigned getNumLoops() { return lowerBound().size(); }
   }];
+  let parser = [{ return parseWsLoopOp(parser, result); }];
+  let printer = [{ return printWsLoopOp(p, *this); }];
 }
 
 def YieldOp : OpenMP_Op<"yield", [NoSideEffect, ReturnLike, Terminator,
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 907ba65c07b7..06854cd99be1 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -17,6 +17,7 @@
 #include "mlir/IR/OperationSupport.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include <cstddef>
@@ -172,8 +173,8 @@ static void printParallelOp(OpAsmPrinter &p, ParallelOp op) {
 }
 
 /// Emit an error if the same clause is present more than once on an operation.
-static ParseResult allowedOnce(OpAsmParser &parser, llvm::StringRef clause,
-                               llvm::StringRef operation) {
+static ParseResult allowedOnce(OpAsmParser &parser, StringRef clause,
+                               StringRef operation) {
   return parser.emitError(parser.getNameLoc())
          << " at most one " << clause << " clause can appear on the "
          << operation << " operation";
@@ -213,7 +214,7 @@ static ParseResult parseParallelOp(OpAsmParser &parser,
   SmallVector<OpAsmParser::OperandType> allocators;
   SmallVector<Type> allocatorTypes;
   std::array<int, 8> segments{0, 0, 0, 0, 0, 0, 0, 0};
-  llvm::StringRef keyword;
+  StringRef keyword;
   bool defaultVal = false;
   bool procBind = false;
 
@@ -225,11 +226,11 @@ static ParseResult parseParallelOp(OpAsmParser &parser,
   const int copyinClausePos = 5;
   const int allocateClausePos = 6;
   const int allocatorPos = 7;
-  const llvm::StringRef opName = result.name.getStringRef();
+  const StringRef opName = result.name.getStringRef();
 
   while (succeeded(parser.parseOptionalKeyword(&keyword))) {
     if (keyword == "if") {
-      // Fail if there was already another if condition
+      // Fail if there was already another if condition.
       if (segments[ifClausePos])
        return allowedOnce(parser, "if", opName);
       if (parser.parseLParen() || parser.parseOperand(ifCond.first) ||
@@ -237,7 +238,7 @@ static ParseResult parseParallelOp(OpAsmParser &parser,
         return failure();
       segments[ifClausePos] = 1;
     } else if (keyword == "num_threads") {
-      // fail if there was already another num_threads clause
+      // Fail if there was already another num_threads clause.
       if (segments[numThreadsClausePos])
         return allowedOnce(parser, "num_threads", opName);
       if (parser.parseLParen() || parser.parseOperand(numThreads.first) ||
@@ -245,35 +246,35 @@ static ParseResult parseParallelOp(OpAsmParser &parser,
         return failure();
       segments[numThreadsClausePos] = 1;
     } else if (keyword == "private") {
-      // fail if there was already another private clause
+      // Fail if there was already another private clause.
       if (segments[privateClausePos])
         return allowedOnce(parser, "private", opName);
       if (parseOperandAndTypeList(parser, privates, privateTypes))
         return failure();
       segments[privateClausePos] = privates.size();
     } else if (keyword == "firstprivate") {
-      // fail if there was already another firstprivate clause
+      // Fail if there was already another firstprivate clause.
       if (segments[firstprivateClausePos])
         return allowedOnce(parser, "firstprivate", opName);
       if (parseOperandAndTypeList(parser, firstprivates, firstprivateTypes))
         return failure();
       segments[firstprivateClausePos] = firstprivates.size();
     } else if (keyword == "shared") {
-      // fail if there was already another shared clause
+      // Fail if there was already another shared clause.
       if (segments[sharedClausePos])
         return allowedOnce(parser, "shared", opName);
       if (parseOperandAndTypeList(parser, shareds, sharedTypes))
         return failure();
       segments[sharedClausePos] = shareds.size();
     } else if (keyword == "copyin") {
-      // fail if there was already another copyin clause
+      // Fail if there was already another copyin clause.
       if (segments[copyinClausePos])
         return allowedOnce(parser, "copyin", opName);
       if (parseOperandAndTypeList(parser, copyins, copyinTypes))
         return failure();
       segments[copyinClausePos] = copyins.size();
     } else if (keyword == "allocate") {
-      // fail if there was already another allocate clause
+      // Fail if there was already another allocate clause.
if (segments[allocateClausePos]) return allowedOnce(parser, "allocate", opName); if (parseAllocateAndAllocator(parser, allocates, allocateTypes, @@ -282,27 +283,27 @@ static ParseResult parseParallelOp(OpAsmParser &parser, segments[allocateClausePos] = allocates.size(); segments[allocatorPos] = allocators.size(); } else if (keyword == "default") { - // fail if there was already another default clause + // Fail if there was already another default clause. if (defaultVal) return allowedOnce(parser, "default", opName); defaultVal = true; - llvm::StringRef defval; + StringRef defval; if (parser.parseLParen() || parser.parseKeyword(&defval) || parser.parseRParen()) return failure(); - llvm::SmallString<16> attrval; + SmallString<16> attrval; // The def prefix is required for the attribute as "private" is a keyword - // in C++ + // in C++. attrval += "def"; attrval += defval; auto attr = parser.getBuilder().getStringAttr(attrval); result.addAttribute("default_val", attr); } else if (keyword == "proc_bind") { - // fail if there was already another proc_bind clause + // Fail if there was already another proc_bind clause. if (procBind) return allowedOnce(parser, "proc_bind", opName); procBind = true; - llvm::StringRef bind; + StringRef bind; if (parser.parseLParen() || parser.parseKeyword(&bind) || parser.parseRParen()) return failure(); @@ -315,48 +316,48 @@ static ParseResult parseParallelOp(OpAsmParser &parser, } } - // Add if parameter + // Add if parameter. if (segments[ifClausePos] && parser.resolveOperand(ifCond.first, ifCond.second, result.operands)) return failure(); - // Add num_threads parameter + // Add num_threads parameter. if (segments[numThreadsClausePos] && parser.resolveOperand(numThreads.first, numThreads.second, result.operands)) return failure(); - // Add private parameters + // Add private parameters. if (segments[privateClausePos] && parser.resolveOperands(privates, privateTypes, privates[0].location, result.operands)) return failure(); - // Add firstprivate parameters + // Add firstprivate parameters. if (segments[firstprivateClausePos] && parser.resolveOperands(firstprivates, firstprivateTypes, firstprivates[0].location, result.operands)) return failure(); - // Add shared parameters + // Add shared parameters. if (segments[sharedClausePos] && parser.resolveOperands(shareds, sharedTypes, shareds[0].location, result.operands)) return failure(); - // Add copyin parameters + // Add copyin parameters. if (segments[copyinClausePos] && parser.resolveOperands(copyins, copyinTypes, copyins[0].location, result.operands)) return failure(); - // Add allocate parameters + // Add allocate parameters. if (segments[allocateClausePos] && parser.resolveOperands(allocates, allocateTypes, allocates[0].location, result.operands)) return failure(); - // Add allocator parameters + // Add allocator parameters. 
  if (segments[allocatorPos] &&
      parser.resolveOperands(allocators, allocatorTypes, allocators[0].location,
                             result.operands))
    return failure();
@@ -373,6 +374,335 @@ static ParseResult parseParallelOp(OpAsmParser &parser,
   return success();
 }
 
+/// linear ::= `linear` `(` linear-list `)`
+/// linear-list := linear-val | linear-val linear-list
+/// linear-val := ssa-id-and-type `=` ssa-id-and-type
+static ParseResult
+parseLinearClause(OpAsmParser &parser,
+                  SmallVectorImpl<OpAsmParser::OperandType> &vars,
+                  SmallVectorImpl<Type> &types,
+                  SmallVectorImpl<OpAsmParser::OperandType> &stepVars) {
+  if (parser.parseLParen())
+    return failure();
+
+  do {
+    OpAsmParser::OperandType var;
+    Type type;
+    OpAsmParser::OperandType stepVar;
+    if (parser.parseOperand(var) || parser.parseEqual() ||
+        parser.parseOperand(stepVar) || parser.parseColonType(type))
+      return failure();
+
+    vars.push_back(var);
+    types.push_back(type);
+    stepVars.push_back(stepVar);
+  } while (succeeded(parser.parseOptionalComma()));
+
+  if (parser.parseRParen())
+    return failure();
+
+  return success();
+}
+
+/// schedule ::= `schedule` `(` sched-list `)`
+/// sched-list ::= sched-val | sched-val sched-list
+/// sched-val ::= sched-with-chunk | sched-wo-chunk
+/// sched-with-chunk ::= sched-with-chunk-types (`=` ssa-id-and-type)?
+/// sched-with-chunk-types ::= `static` | `dynamic` | `guided`
+/// sched-wo-chunk ::= `auto` | `runtime`
+static ParseResult
+parseScheduleClause(OpAsmParser &parser, SmallString<8> &schedule,
+                    Optional<OpAsmParser::OperandType> &chunkSize) {
+  if (parser.parseLParen())
+    return failure();
+
+  StringRef keyword;
+  if (parser.parseKeyword(&keyword))
+    return failure();
+
+  schedule = keyword;
+  if (keyword == "static" || keyword == "dynamic" || keyword == "guided") {
+    if (succeeded(parser.parseOptionalEqual())) {
+      chunkSize = OpAsmParser::OperandType{};
+      if (parser.parseOperand(*chunkSize))
+        return failure();
+    } else {
+      chunkSize = llvm::NoneType::None;
+    }
+  } else if (keyword == "auto" || keyword == "runtime") {
+    chunkSize = llvm::NoneType::None;
+  } else {
+    return parser.emitError(parser.getNameLoc()) << " expected schedule kind";
+  }
+
+  if (parser.parseRParen())
+    return failure();
+
+  return success();
+}
+
+/// Parses an OpenMP Workshare Loop operation.
+///
+/// operation ::= `omp.wsloop` loop-control clause-list
+/// loop-control ::= `(` ssa-id-list `)` `:` type `=` loop-bounds
+/// loop-bounds := `(` ssa-id-list `)` to `(` ssa-id-list `)` steps
+/// steps := `step` `(`ssa-id-list`)`
+/// clause-list ::= clause | empty | clause-list
+/// clause ::= private | firstprivate | lastprivate | linear | schedule |
+///            collapse | nowait | ordered | order | inclusive
+/// private ::= `private` `(` ssa-id-and-type-list `)`
+/// firstprivate ::= `firstprivate` `(` ssa-id-and-type-list `)`
+/// lastprivate ::= `lastprivate` `(` ssa-id-and-type-list `)`
+/// linear ::= `linear` `(` linear-list `)`
+/// schedule ::= `schedule` `(` sched-list `)`
+/// collapse ::= `collapse` `(` ssa-id-and-type `)`
+/// nowait ::= `nowait`
+/// ordered ::= `ordered` `(` ssa-id-and-type `)`
+/// order ::= `order` `(` `concurrent` `)`
+/// inclusive ::= `inclusive`
+///
+static ParseResult parseWsLoopOp(OpAsmParser &parser, OperationState &result) {
+  Type loopVarType;
+  int numIVs;
+
+  // Parse an opening `(` followed by induction variables followed by `)`.
+  SmallVector<OpAsmParser::OperandType> ivs;
+  if (parser.parseRegionArgumentList(ivs, /*requiredOperandCount=*/-1,
+                                     OpAsmParser::Delimiter::Paren))
+    return failure();
+
+  numIVs = static_cast<int>(ivs.size());
+
+  if (parser.parseColonType(loopVarType))
+    return failure();
+
+  // Parse loop
bounds.
+  SmallVector<OpAsmParser::OperandType> lower;
+  if (parser.parseEqual() ||
+      parser.parseOperandList(lower, numIVs, OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(lower, loopVarType, result.operands))
+    return failure();
+
+  SmallVector<OpAsmParser::OperandType> upper;
+  if (parser.parseKeyword("to") ||
+      parser.parseOperandList(upper, numIVs, OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(upper, loopVarType, result.operands))
+    return failure();
+
+  // Parse step values.
+  SmallVector<OpAsmParser::OperandType> steps;
+  if (parser.parseKeyword("step") ||
+      parser.parseOperandList(steps, numIVs, OpAsmParser::Delimiter::Paren) ||
+      parser.resolveOperands(steps, loopVarType, result.operands))
+    return failure();
+
+  SmallVector<OpAsmParser::OperandType> privates;
+  SmallVector<Type> privateTypes;
+  SmallVector<OpAsmParser::OperandType> firstprivates;
+  SmallVector<Type> firstprivateTypes;
+  SmallVector<OpAsmParser::OperandType> lastprivates;
+  SmallVector<Type> lastprivateTypes;
+  SmallVector<OpAsmParser::OperandType> linears;
+  SmallVector<Type> linearTypes;
+  SmallVector<OpAsmParser::OperandType> linearSteps;
+  SmallString<8> schedule;
+  Optional<OpAsmParser::OperandType> scheduleChunkSize;
+  std::array<int, 9> segments{numIVs, numIVs, numIVs, 0, 0, 0, 0, 0, 0};
+
+  const StringRef opName = result.name.getStringRef();
+  StringRef keyword;
+
+  enum SegmentPos {
+    lbPos = 0,
+    ubPos,
+    stepPos,
+    privateClausePos,
+    firstprivateClausePos,
+    lastprivateClausePos,
+    linearClausePos,
+    linearStepPos,
+    scheduleClausePos,
+  };
+
+  while (succeeded(parser.parseOptionalKeyword(&keyword))) {
+    if (keyword == "private") {
+      if (segments[privateClausePos])
+        return allowedOnce(parser, "private", opName);
+      if (parseOperandAndTypeList(parser, privates, privateTypes))
+        return failure();
+      segments[privateClausePos] = privates.size();
+    } else if (keyword == "firstprivate") {
+      // Fail if there was already another firstprivate clause.
+      if (segments[firstprivateClausePos])
+        return allowedOnce(parser, "firstprivate", opName);
+      if (parseOperandAndTypeList(parser, firstprivates, firstprivateTypes))
+        return failure();
+      segments[firstprivateClausePos] = firstprivates.size();
+    } else if (keyword == "lastprivate") {
+      // Fail if there was already another lastprivate clause.
+      if (segments[lastprivateClausePos])
+        return allowedOnce(parser, "lastprivate", opName);
+      if (parseOperandAndTypeList(parser, lastprivates, lastprivateTypes))
+        return failure();
+      segments[lastprivateClausePos] = lastprivates.size();
+    } else if (keyword == "linear") {
+      // Fail if there was already another linear clause.
+      if (segments[linearClausePos])
+        return allowedOnce(parser, "linear", opName);
+      if (parseLinearClause(parser, linears, linearTypes, linearSteps))
+        return failure();
+      segments[linearClausePos] = linears.size();
+      segments[linearStepPos] = linearSteps.size();
+    } else if (keyword == "schedule") {
+      if (!schedule.empty())
+        return allowedOnce(parser, "schedule", opName);
+      if (parseScheduleClause(parser, schedule, scheduleChunkSize))
+        return failure();
+      if (scheduleChunkSize) {
+        segments[scheduleClausePos] = 1;
+      }
+    } else if (keyword == "collapse") {
+      auto type = parser.getBuilder().getI64Type();
+      mlir::IntegerAttr attr;
+      if (parser.parseLParen() || parser.parseAttribute(attr, type) ||
+          parser.parseRParen())
+        return failure();
+      result.addAttribute("collapse_val", attr);
+    } else if (keyword == "nowait") {
+      auto attr = UnitAttr::get(parser.getBuilder().getContext());
+      result.addAttribute("nowait", attr);
+    } else if (keyword == "ordered") {
+      mlir::IntegerAttr attr;
+      if (succeeded(parser.parseOptionalLParen())) {
+        auto type = parser.getBuilder().getI64Type();
+        if (parser.parseAttribute(attr, type))
+          return failure();
+        if
(parser.parseRParen())
+          return failure();
+      } else {
+        // Use 0 to represent no ordered parameter was specified.
+        attr = parser.getBuilder().getI64IntegerAttr(0);
+      }
+      result.addAttribute("ordered_val", attr);
+    } else if (keyword == "order") {
+      StringRef order;
+      if (parser.parseLParen() || parser.parseKeyword(&order) ||
+          parser.parseRParen())
+        return failure();
+      auto attr = parser.getBuilder().getStringAttr(order);
+      result.addAttribute("order", attr);
+    } else if (keyword == "inclusive") {
+      auto attr = UnitAttr::get(parser.getBuilder().getContext());
+      result.addAttribute("inclusive", attr);
+    }
+  }
+
+  if (segments[privateClausePos]) {
+    parser.resolveOperands(privates, privateTypes, privates[0].location,
+                           result.operands);
+  }
+
+  if (segments[firstprivateClausePos]) {
+    parser.resolveOperands(firstprivates, firstprivateTypes,
+                           firstprivates[0].location, result.operands);
+  }
+
+  if (segments[lastprivateClausePos]) {
+    parser.resolveOperands(lastprivates, lastprivateTypes,
+                           lastprivates[0].location, result.operands);
+  }
+
+  if (segments[linearClausePos]) {
+    parser.resolveOperands(linears, linearTypes, linears[0].location,
+                           result.operands);
+    auto linearStepType = parser.getBuilder().getI32Type();
+    SmallVector<Type> linearStepTypes(linearSteps.size(), linearStepType);
+    parser.resolveOperands(linearSteps, linearStepTypes,
+                           linearSteps[0].location, result.operands);
+  }
+
+  if (!schedule.empty()) {
+    schedule[0] = llvm::toUpper(schedule[0]);
+    auto attr = parser.getBuilder().getStringAttr(schedule);
+    result.addAttribute("schedule_val", attr);
+    if (scheduleChunkSize) {
+      auto chunkSizeType = parser.getBuilder().getI32Type();
+      parser.resolveOperand(*scheduleChunkSize, chunkSizeType, result.operands);
+    }
+  }
+
+  result.addAttribute("operand_segment_sizes",
+                      parser.getBuilder().getI32VectorAttr(segments));
+
+  // Now parse the body.
+  Region *body = result.addRegion();
+  SmallVector<Type> ivTypes(numIVs, loopVarType);
+  if (parser.parseRegion(*body, ivs, ivTypes))
+    return failure();
+  return success();
+}
+
+static void printWsLoopOp(OpAsmPrinter &p, WsLoopOp op) {
+  auto args = op.getRegion().front().getArguments();
+  p << op.getOperationName() << " (" << args << ") : " << args[0].getType()
+    << " = (" << op.lowerBound() << ") to (" << op.upperBound() << ") step ("
+    << op.step() << ")";
+
+  // Print private, firstprivate and lastprivate parameters.
+  auto printDataVars = [&p](StringRef name, OperandRange vars) {
+    if (vars.empty())
+      return;
+
+    p << " " << name << "(";
+    llvm::interleaveComma(
+        vars, p, [&](const Value &v) { p << v << " : " << v.getType(); });
+    p << ")";
+  };
+  printDataVars("private", op.private_vars());
+  printDataVars("firstprivate", op.firstprivate_vars());
+  printDataVars("lastprivate", op.lastprivate_vars());
+
+  auto linearVars = op.linear_vars();
+  auto linearVarsSize = linearVars.size();
+  if (linearVarsSize) {
+    p << " "
+      << "linear"
+      << "(";
+    for (unsigned i = 0; i < linearVarsSize; ++i) {
+      std::string separator = i == linearVarsSize - 1 ?
")" : ", "; + p << linearVars[i]; + if (op.linear_step_vars().size() > i) + p << " = " << op.linear_step_vars()[i]; + p << " : " << linearVars[i].getType() << separator; + } + } + + if (auto sched = op.schedule_val()) { + auto schedLower = sched->lower(); + p << " schedule(" << schedLower; + if (auto chunk = op.schedule_chunk_var()) { + p << " = " << chunk; + } + p << ")"; + } + + if (auto collapse = op.collapse_val()) + p << " collapse(" << collapse << ")"; + + if (op.nowait()) + p << " nowait"; + + if (auto ordered = op.ordered_val()) { + p << " ordered(" << ordered << ")"; + } + + if (op.inclusive()) { + p << " inclusive"; + } + + p.printRegion(op.region(), /*printEntryBlockArgs=*/false); +} + //===----------------------------------------------------------------------===// // WsLoopOp //===----------------------------------------------------------------------===// @@ -386,9 +716,71 @@ void WsLoopOp::build(OpBuilder &builder, OperationState &state, /*linear_vars=*/ValueRange(), /*linear_step_vars=*/ValueRange(), /*schedule_val=*/nullptr, /*schedule_chunk_var=*/nullptr, /*collapse_val=*/nullptr, - /*nowait=*/false, /*ordered_val=*/nullptr, /*order_val=*/nullptr, - /*inclusive=*/false); + /*nowait=*/nullptr, /*ordered_val=*/nullptr, /*order_val=*/nullptr, + /*inclusive=*/nullptr, /*buildBody=*/false); + state.addAttributes(attributes); +} + +void WsLoopOp::build(OpBuilder &, OperationState &state, TypeRange resultTypes, + ValueRange operands, ArrayRef attributes) { + state.addOperands(operands); state.addAttributes(attributes); + (void)state.addRegion(); + assert(resultTypes.size() == 0u && "mismatched number of return types"); + state.addTypes(resultTypes); +} + +void WsLoopOp::build(OpBuilder &builder, OperationState &result, + TypeRange typeRange, ValueRange lowerBounds, + ValueRange upperBounds, ValueRange steps, + ValueRange privateVars, ValueRange firstprivateVars, + ValueRange lastprivateVars, ValueRange linearVars, + ValueRange linearStepVars, StringAttr scheduleVal, + Value scheduleChunkVar, IntegerAttr collapseVal, + UnitAttr nowait, IntegerAttr orderedVal, + StringAttr orderVal, UnitAttr inclusive, bool buildBody) { + result.addOperands(lowerBounds); + result.addOperands(upperBounds); + result.addOperands(steps); + result.addOperands(privateVars); + result.addOperands(firstprivateVars); + result.addOperands(linearVars); + result.addOperands(linearStepVars); + if (scheduleChunkVar) + result.addOperands(scheduleChunkVar); + + if (scheduleVal) + result.addAttribute("schedule_val", scheduleVal); + if (collapseVal) + result.addAttribute("collapse_val", collapseVal); + if (nowait) + result.addAttribute("nowait", nowait); + if (orderedVal) + result.addAttribute("ordered_val", orderedVal); + if (orderVal) + result.addAttribute("order", orderVal); + if (inclusive) + result.addAttribute("inclusive", inclusive); + result.addAttribute( + WsLoopOp::getOperandSegmentSizeAttr(), + builder.getI32VectorAttr( + {static_cast(lowerBounds.size()), + static_cast(upperBounds.size()), + static_cast(steps.size()), + static_cast(privateVars.size()), + static_cast(firstprivateVars.size()), + static_cast(lastprivateVars.size()), + static_cast(linearVars.size()), + static_cast(linearStepVars.size()), + static_cast(scheduleChunkVar != nullptr ? 
1 : 0)})); + + Region *bodyRegion = result.addRegion(); + if (buildBody) { + OpBuilder::InsertionGuard guard(builder); + unsigned numIVs = steps.size(); + SmallVector argTypes(numIVs, steps.getType().front()); + builder.createBlock(bodyRegion, {}, argTypes); + } } #define GET_OP_CLASSES diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index c1fc82e51c50..e0bb0134a14a 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -34,10 +34,8 @@ func @branch_loop() { func @wsloop(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { // CHECK: omp.parallel omp.parallel { - // CHECK: omp.wsloop - // CHECK: (%[[ARG0]], %[[ARG1]], %[[ARG2]], %[[ARG3]], %[[ARG4]], %[[ARG5]]) + // CHECK: omp.wsloop (%[[ARG6:.*]], %[[ARG7:.*]]) : i64 = (%[[ARG0]], %[[ARG1]]) to (%[[ARG2]], %[[ARG3]]) step (%[[ARG4]], %[[ARG5]]) { "omp.wsloop"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) ( { - // CHECK: ^{{.*}}(%[[ARG6:.*]]: i64, %[[ARG7:.*]]: i64): ^bb0(%arg6: index, %arg7: index): // no predecessors // CHECK: "test.payload"(%[[ARG6]], %[[ARG7]]) : (i64, i64) -> () "test.payload"(%arg6, %arg7) : (index, index) -> () diff --git a/mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir b/mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir index 466bd6aa96af..60a143a85006 100644 --- a/mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir +++ b/mlir/test/Conversion/SCFToOpenMP/scf-to-openmp.mlir @@ -4,9 +4,9 @@ func @parallel(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { // CHECK: omp.parallel { - // CHECK: "omp.wsloop"({{.*}}) ( { + // CHECK: omp.wsloop (%[[LVAR1:.*]], %[[LVAR2:.*]]) : index = (%arg0, %arg1) to (%arg2, %arg3) step (%arg4, %arg5) { scf.parallel (%i, %j) = (%arg0, %arg1) to (%arg2, %arg3) step (%arg4, %arg5) { - // CHECK: test.payload + // CHECK: "test.payload"(%[[LVAR1]], %[[LVAR2]]) : (index, index) -> () "test.payload"(%i, %j) : (index, index) -> () // CHECK: omp.yield // CHECK: } @@ -20,12 +20,12 @@ func @parallel(%arg0: index, %arg1: index, %arg2: index, func @nested_loops(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { // CHECK: omp.parallel { - // CHECK: "omp.wsloop"({{.*}}) ( { + // CHECK: omp.wsloop (%[[LVAR_OUT1:.*]]) : index = (%arg0) to (%arg2) step (%arg4) { // CHECK-NOT: omp.parallel scf.parallel (%i) = (%arg0) to (%arg2) step (%arg4) { - // CHECK: "omp.wsloop"({{.*}}) ( { + // CHECK: omp.wsloop (%[[LVAR_IN1:.*]]) : index = (%arg1) to (%arg3) step (%arg5) { scf.parallel (%j) = (%arg1) to (%arg3) step (%arg5) { - // CHECK: test.payload + // CHECK: "test.payload"(%[[LVAR_OUT1]], %[[LVAR_IN1]]) : (index, index) -> () "test.payload"(%i, %j) : (index, index) -> () // CHECK: omp.yield // CHECK: } @@ -41,9 +41,9 @@ func @nested_loops(%arg0: index, %arg1: index, %arg2: index, func @adjacent_loops(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index) { // CHECK: omp.parallel { - // CHECK: "omp.wsloop"({{.*}}) ( { + // CHECK: omp.wsloop (%[[LVAR_AL1:.*]]) : index = (%arg0) to (%arg2) step (%arg4) { scf.parallel (%i) = (%arg0) to (%arg2) step (%arg4) { - // CHECK: test.payload1 + // CHECK: "test.payload1"(%[[LVAR_AL1]]) : (index) -> () "test.payload1"(%i) : (index) -> () // CHECK: omp.yield // CHECK: } @@ -52,9 +52,9 @@ func @adjacent_loops(%arg0: index, %arg1: index, %arg2: index, // CHECK: } // CHECK: omp.parallel { - 
// CHECK: "omp.wsloop"({{.*}}) ( { + // CHECK: omp.wsloop (%[[LVAR_AL2:.*]]) : index = (%arg1) to (%arg3) step (%arg5) { scf.parallel (%j) = (%arg1) to (%arg3) step (%arg5) { - // CHECK: test.payload2 + // CHECK: "test.payload2"(%[[LVAR_AL2]]) : (index) -> () "test.payload2"(%j) : (index) -> () // CHECK: omp.yield // CHECK: } diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index 6b9be10c7693..8f7f9c1ca69c 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -89,77 +89,192 @@ func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : si32) } func @omp_parallel_pretty(%data_var : memref, %if_cond : i1, %num_threads : si32, %allocator : si32) -> () { - // CHECK: omp.parallel - omp.parallel { - omp.terminator - } + // CHECK: omp.parallel + omp.parallel { + omp.terminator + } + + // CHECK: omp.parallel num_threads(%{{.*}} : si32) + omp.parallel num_threads(%num_threads : si32) { + omp.terminator + } + + // CHECK: omp.parallel allocate(%{{.*}} : memref -> %{{.*}} : memref) + omp.parallel allocate(%data_var : memref -> %data_var : memref) { + omp.terminator + } + + // CHECK: omp.parallel private(%{{.*}} : memref, %{{.*}} : memref) firstprivate(%{{.*}} : memref) + omp.parallel private(%data_var : memref, %data_var : memref) firstprivate(%data_var : memref) { + omp.terminator + } + + // CHECK omp.parallel shared(%{{.*}} : memref) copyin(%{{.*}} : memref, %{{.*}} : memref) + omp.parallel shared(%data_var : memref) copyin(%data_var : memref, %data_var : memref) { + omp.parallel if(%if_cond: i1) { + omp.terminator + } + omp.terminator + } + + // CHECK omp.parallel if(%{{.*}}) num_threads(%{{.*}} : si32) private(%{{.*}} : memref) proc_bind(close) + omp.parallel num_threads(%num_threads : si32) if(%if_cond: i1) + private(%data_var : memref) proc_bind(close) { + omp.terminator + } + + return +} - // CHECK: omp.parallel num_threads(%{{.*}} : si32) - omp.parallel num_threads(%num_threads : si32) { - omp.terminator - } +// CHECK-LABEL: omp_wsloop +func @omp_wsloop(%lb : index, %ub : index, %step : index, %data_var : memref, %linear_var : i32, %chunk_var : i32) -> () { - // CHECK: omp.parallel allocate(%{{.*}} : memref -> %{{.*}} : memref) - omp.parallel allocate(%data_var : memref -> %data_var : memref) { - omp.terminator + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref, %{{.*}} : memref) collapse(2) ordered(1) + "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var) ({ + ^bb0(%iv: index): + omp.yield + }) {operand_segment_sizes = dense<[1,1,1,2,0,0,0,0,0]> : vector<9xi32>, collapse_val = 2, ordered_val = 1} : + (index, index, index, memref, memref) -> () + + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref) schedule(static) + "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var) ({ + ^bb0(%iv: index): + omp.yield + }) {operand_segment_sizes = dense<[1,1,1,0,0,0,1,1,0]> : vector<9xi32>, schedule_val = "Static"} : + (index, index, index, memref, i32) -> () + + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref, %{{.*}} = %{{.*}} : memref) schedule(static) + "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %linear_var, %linear_var) ({ + ^bb0(%iv: index): + omp.yield + }) {operand_segment_sizes = dense<[1,1,1,0,0,0,2,2,0]> : vector<9xi32>, schedule_val = "Static"} : + (index, index, index, memref, memref, i32, i32) -> () + + // CHECK: omp.wsloop 
(%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) linear(%{{.*}} = %{{.*}} : memref) schedule(dynamic = %{{.*}}) collapse(3) ordered(2) + "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %data_var, %data_var, %linear_var, %chunk_var) ({ + ^bb0(%iv: index): + omp.yield + }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1,1]> : vector<9xi32>, schedule_val = "Dynamic", collapse_val = 3, ordered_val = 2} : + (index, index, index, memref, memref, memref, memref, i32, i32) -> () + + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) schedule(auto) nowait + "omp.wsloop" (%lb, %ub, %step, %data_var) ({ + ^bb0(%iv: index): + omp.yield + }) {operand_segment_sizes = dense<[1,1,1,1,0,0,0,0,0]> : vector<9xi32>, nowait, schedule_val = "Auto"} : + (index, index, index, memref) -> () + + return +} + +// CHECK-LABEL: omp_wsloop_pretty +func @omp_wsloop_pretty(%lb : index, %ub : index, %step : index, + %data_var : memref, %linear_var : i32, %chunk_var : i32) -> () { + + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) private(%data_var : memref) collapse(2) ordered(2) { + omp.yield } - // CHECK: omp.parallel private(%{{.*}} : memref, %{{.*}} : memref) firstprivate(%{{.*}} : memref) - omp.parallel private(%data_var : memref, %data_var : memref) firstprivate(%data_var : memref) { - omp.terminator + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) linear(%{{.*}} = %{{.*}} : memref) schedule(static) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) schedule(static) lastprivate(%data_var : memref) linear(%data_var = %linear_var : memref) { + omp.yield } - // CHECK omp.parallel shared(%{{.*}} : memref) copyin(%{{.*}} : memref, %{{.*}} : memref) - omp.parallel shared(%data_var : memref) copyin(%data_var : memref, %data_var : memref) { - omp.parallel if(%if_cond: i1) { - omp.terminator - } - omp.terminator + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private(%{{.*}} : memref) firstprivate(%{{.*}} : memref) lastprivate(%{{.*}} : memref) linear(%{{.*}} = %{{.*}} : memref) schedule(static = %{{.*}}) collapse(3) ordered(2) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) ordered(2) private(%data_var : memref) + firstprivate(%data_var : memref) lastprivate(%data_var : memref) linear(%data_var = %linear_var : memref) + schedule(static = %chunk_var) collapse(3) { + omp.yield } - // CHECK omp.parallel if(%{{.*}}) num_threads(%{{.*}} : si32) private(%{{.*}} : memref) proc_bind(close) - omp.parallel num_threads(%num_threads : si32) if(%if_cond: i1) - private(%data_var : memref) proc_bind(close) { - omp.terminator + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) private({{.*}} : memref) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) private(%data_var : memref) { + omp.yield } return } -func @omp_wsloop(%lb : index, %ub : index, %step : index, - %data_var : memref, %linear_var : si32, %chunk_var : si32) -> () { +// CHECK-LABEL: omp_wsloop_pretty_multi_block +func @omp_wsloop_pretty_multi_block(%lb : index, %ub : index, %step : index, %data1 : memref, %data2 : memref) -> () { - // CHECK: "omp.wsloop"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) - "omp.wsloop" (%lb, %ub, %step, %data_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) 
step (%{{.*}}) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) { + %1 = "test.payload"(%iv) : (index) -> (i32) + br ^bb1(%1: i32) + ^bb1(%arg: i32): + memref.store %arg, %data1[%iv] : memref omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,0,0,0,0,0]> : vector<9xi32>, collapse_val = 2, ordered_val = 1} : - (index, index, index, memref) -> () + } - // CHECK: "omp.wsloop"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) - "omp.wsloop" (%lb, %lb, %ub, %ub, %step, %step, %data_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) { + %c = "test.condition"(%iv) : (index) -> (i1) + %v1 = "test.payload"(%iv) : (index) -> (i32) + cond_br %c, ^bb1(%v1: i32), ^bb2(%v1: i32) + ^bb1(%arg0: i32): + memref.store %arg0, %data1[%iv] : memref + br ^bb3 + ^bb2(%arg1: i32): + memref.store %arg1, %data2[%iv] : memref + br ^bb3 + ^bb3: omp.yield - }) {operand_segment_sizes = dense<[2,2,2,1,0,0,0,0,0]> : vector<9xi32>, collapse_val = 2, ordered_val = 1} : - (index, index, index, index, index, index, memref) -> () - + } - // CHECK: "omp.wsloop"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) - "omp.wsloop" (%lb, %ub, %step, %data_var, %linear_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : index = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + omp.wsloop (%iv) : index = (%lb) to (%ub) step (%step) { + %c = "test.condition"(%iv) : (index) -> (i1) + %v1 = "test.payload"(%iv) : (index) -> (i32) + cond_br %c, ^bb1(%v1: i32), ^bb2(%v1: i32) + ^bb1(%arg0: i32): + memref.store %arg0, %data1[%iv] : memref omp.yield - }) {operand_segment_sizes = dense<[1,1,1,0,0,0,1,1,0]> : vector<9xi32>, schedule_val = "Static"} : - (index, index, index, memref, si32) -> () + ^bb2(%arg1: i32): + memref.store %arg1, %data2[%iv] : memref + omp.yield + } - // CHECK: "omp.wsloop"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) - "omp.wsloop" (%lb, %ub, %step, %data_var, %data_var, %data_var, %data_var, %linear_var, %chunk_var) ({ + return +} + +// CHECK-LABEL: omp_wsloop_pretty_non_index +func @omp_wsloop_pretty_non_index(%lb1 : i32, %ub1 : i32, %step1 : i32, %lb2 : i64, %ub2 : i64, %step2 : i64, + %data1 : memref, %data2 : memref) -> () { + + // CHECK: omp.wsloop (%{{.*}}) : i32 = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + omp.wsloop (%iv1) : i32 = (%lb1) to (%ub1) step (%step1) { + %1 = "test.payload"(%iv1) : (i32) -> (index) + br ^bb1(%1: index) + ^bb1(%arg1: index): + memref.store %iv1, %data1[%arg1] : memref omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,1,1,1,1,1]> : vector<9xi32>, schedule_val = "Dynamic", collapse_val = 3, ordered_val = 2} : - (index, index, index, memref, memref, memref, memref, si32, si32) -> () + } - // CHECK: "omp.wsloop"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) - "omp.wsloop" (%lb, %ub, %step, %data_var) ({ + // CHECK: omp.wsloop (%{{.*}}) : i64 = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) + omp.wsloop (%iv2) : i64 = (%lb2) to (%ub2) step (%step2) { + %2 = "test.payload"(%iv2) : (i64) -> (index) + br ^bb1(%2: index) + ^bb1(%arg2: index): + memref.store %iv2, %data2[%arg2] : memref omp.yield - }) {operand_segment_sizes = dense<[1,1,1,1,0,0,0,0,0]> : vector<9xi32>, nowait, schedule_val = "Auto"} : - (index, index, index, memref) -> () + } + + return +} +// CHECK-LABEL: omp_wsloop_pretty_multiple +func @omp_wsloop_pretty_multiple(%lb1 : i32, %ub1 : i32, %step1 : i32, %lb2 : i32, %ub2 : i32, %step2 : i32, %data1 : memref) -> () { + + // CHECK: omp.wsloop (%{{.*}}, %{{.*}}) : i32 = (%{{.*}}, %{{.*}}) to 
(%{{.*}}, %{{.*}}) step (%{{.*}}, %{{.*}}) + omp.wsloop (%iv1, %iv2) : i32 = (%lb1, %lb2) to (%ub1, %ub2) step (%step1, %step2) { + %1 = "test.payload"(%iv1) : (i32) -> (index) + %2 = "test.payload"(%iv2) : (i32) -> (index) + memref.store %iv1, %data1[%1] : memref + memref.store %iv2, %data1[%2] : memref + omp.yield + } return } -- GitLab From 078b338ba67a26809bad682bbd5617718e2f655b Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 18 Mar 2021 14:00:07 +0000 Subject: [PATCH 0038/1000] [AMDGPU] Add some gfx1010 test coverage. NFC. --- llvm/test/CodeGen/AMDGPU/rel32.ll | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/rel32.ll b/llvm/test/CodeGen/AMDGPU/rel32.ll index 4e9878eea23a..44e3b54d0e67 100644 --- a/llvm/test/CodeGen/AMDGPU/rel32.ll +++ b/llvm/test/CodeGen/AMDGPU/rel32.ll @@ -1,11 +1,12 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s @g = protected local_unnamed_addr addrspace(4) externally_initialized global i32 0, align 4 ; CHECK-LABEL: rel32_neg_offset: ; CHECK: s_getpc_b64 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{]}} -; CHECK: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4 -; CHECK: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4 +; CHECK-NEXT: s_add_u32 s[[LO]], s[[LO]], g@rel32@lo-4 +; CHECK-NEXT: s_addc_u32 s[[HI]], s[[HI]], g@rel32@hi+4 define i32 addrspace(4)* @rel32_neg_offset() { %r = getelementptr i32, i32 addrspace(4)* @g, i64 -2 ret i32 addrspace(4)* %r -- GitLab From e6ce0db378473c1d264152f370af719903b98bf8 Mon Sep 17 00:00:00 2001 From: Andrew Savonichev Date: Fri, 12 Mar 2021 16:50:38 +0300 Subject: [PATCH 0039/1000] [MCA] Ensure that writes occur in-order Delay the issue of a new instruction if that leads to out-of-order commits of writes. This patch fixes the problem described in: https://bugs.llvm.org/show_bug.cgi?id=41796#c3 Differential Revision: https://reviews.llvm.org/D98604 --- .../llvm/MCA/Stages/InOrderIssueStage.h | 7 +- llvm/lib/MCA/Stages/InOrderIssueStage.cpp | 41 +++++++++++ .../llvm-mca/AArch64/Cortex/A55-all-stats.s | 2 +- .../llvm-mca/AArch64/Cortex/A55-all-views.s | 10 +-- .../AArch64/Cortex/A55-in-order-retire.s | 68 ++++++++++--------- 5 files changed, 89 insertions(+), 39 deletions(-) diff --git a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h index 0b4ea99d06db..867a6c1df3c5 100644 --- a/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h +++ b/llvm/include/llvm/MCA/Stages/InOrderIssueStage.h @@ -50,6 +50,11 @@ class InOrderIssueStage final : public Stage { /// Number of instructions that can be issued in the current cycle. unsigned Bandwidth; + /// Number of cycles (counted from the current cycle) until the last write is + /// committed. This is taken into account to ensure that writes commit in the + /// program order. 
+ unsigned LastWriteBackCycle; + InOrderIssueStage(const InOrderIssueStage &Other) = delete; InOrderIssueStage &operator=(const InOrderIssueStage &Other) = delete; @@ -69,7 +74,7 @@ public: const MCSchedModel &SM, const MCSubtargetInfo &STI) : SM(SM), STI(STI), RCU(RCU), PRF(PRF), RM(std::make_unique(SM)), NumIssued(0), - StallCyclesLeft(0), Bandwidth(0) {} + StallCyclesLeft(0), Bandwidth(0), LastWriteBackCycle(0) {} bool isAvailable(const InstRef &) const override; bool hasWorkToComplete() const override; diff --git a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp index a675b92e1068..dd2270d3a8f3 100644 --- a/llvm/lib/MCA/Stages/InOrderIssueStage.cpp +++ b/llvm/lib/MCA/Stages/InOrderIssueStage.cpp @@ -57,6 +57,32 @@ static bool hasResourceHazard(const ResourceManager &RM, const InstRef &IR) { return false; } +static unsigned findLastWriteBackCycle(const InstRef &IR) { + unsigned LastWBCycle = 0; + for (const WriteState &WS : IR.getInstruction()->getDefs()) { + int CyclesLeft = WS.getCyclesLeft(); + if (CyclesLeft == UNKNOWN_CYCLES) + CyclesLeft = WS.getLatency(); + if (CyclesLeft < 0) + CyclesLeft = 0; + LastWBCycle = std::max(LastWBCycle, (unsigned)CyclesLeft); + } + return LastWBCycle; +} + +static unsigned findFirstWriteBackCycle(const InstRef &IR) { + unsigned FirstWBCycle = ~0U; + for (const WriteState &WS : IR.getInstruction()->getDefs()) { + int CyclesLeft = WS.getCyclesLeft(); + if (CyclesLeft == UNKNOWN_CYCLES) + CyclesLeft = WS.getLatency(); + if (CyclesLeft < 0) + CyclesLeft = 0; + FirstWBCycle = std::min(FirstWBCycle, (unsigned)CyclesLeft); + } + return FirstWBCycle; +} + /// Return a number of cycles left until register requirements of the /// instructions are met. static unsigned checkRegisterHazard(const RegisterFile &PRF, @@ -116,6 +142,14 @@ bool InOrderIssueStage::canExecute(const InstRef &IR, HWStallEvent(HWStallEvent::DispatchGroupStall, IR)); notifyEvent( HWPressureEvent(HWPressureEvent::RESOURCES, IR)); + } else if (LastWriteBackCycle) { + if (!IR.getInstruction()->getDesc().RetireOOO) { + unsigned NextWriteBackCycle = findFirstWriteBackCycle(IR); + // Delay the instruction to ensure that writes occur in program order + if (NextWriteBackCycle < LastWriteBackCycle) { + *StallCycles = LastWriteBackCycle - NextWriteBackCycle; + } + } } return *StallCycles == 0; @@ -213,6 +247,9 @@ llvm::Error InOrderIssueStage::tryIssue(InstRef &IR, unsigned *StallCycles) { IssuedInst.push_back(IR); ++NumIssued; + if (!IR.getInstruction()->getDesc().RetireOOO) + LastWriteBackCycle = findLastWriteBackCycle(IR); + return llvm::ErrorSuccess(); } @@ -285,6 +322,10 @@ llvm::Error InOrderIssueStage::cycleStart() { llvm::Error InOrderIssueStage::cycleEnd() { if (StallCyclesLeft > 0) --StallCyclesLeft; + + if (LastWriteBackCycle > 0) + --LastWriteBackCycle; + return llvm::ErrorSuccess(); } diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s index 35149b09f66f..a672c8c879ae 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-stats.s @@ -35,7 +35,7 @@ str w0, [x21, x18, lsl #2] # CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RAT - Register unavailable: 8 (38.1%) # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 diff --git 
a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s index f6b3f622a38b..1d4e41a63c63 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-all-views.s @@ -35,7 +35,7 @@ str w0, [x21, x18, lsl #2] # CHECK-NEXT: 1 4 1.00 * str w0, [x21, x18, lsl #2] # CHECK: Dynamic Dispatch Stall Cycles: -# CHECK-NEXT: RAT - Register unavailable: 10 (47.6%) +# CHECK-NEXT: RAT - Register unavailable: 8 (38.1%) # CHECK-NEXT: RCU - Retire tokens unavailable: 0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 @@ -106,13 +106,13 @@ str w0, [x21, x18, lsl #2] # CHECK: [0,0] DeeER. . . . ldr w4, [x2], #4 # CHECK-NEXT: [0,1] .DeeER . . . ldr w5, [x3] # CHECK-NEXT: [0,2] . DeeeER. . . madd w0, w5, w4, w0 -# CHECK-NEXT: [0,3] . DeeE-R. . . add x3, x3, x13 +# CHECK-NEXT: [0,3] . DeeER. . . add x3, x3, x13 # CHECK-NEXT: [0,4] . DeeER. . . subs x1, x1, #1 # CHECK-NEXT: [0,5] . . DeeeER . . str w0, [x21, x18, lsl #2] # CHECK-NEXT: [1,0] . . DeeER . . ldr w4, [x2], #4 # CHECK-NEXT: [1,1] . . DeeER . . ldr w5, [x3] # CHECK-NEXT: [1,2] . . . DeeeER . madd w0, w5, w4, w0 -# CHECK-NEXT: [1,3] . . . DeeE-R . add x3, x3, x13 +# CHECK-NEXT: [1,3] . . . DeeER . add x3, x3, x13 # CHECK-NEXT: [1,4] . . . DeeER . subs x1, x1, #1 # CHECK-NEXT: [1,5] . . . DeeeER str w0, [x21, x18, lsl #2] @@ -126,7 +126,7 @@ str w0, [x21, x18, lsl #2] # CHECK-NEXT: 0. 2 0.0 0.0 0.0 ldr w4, [x2], #4 # CHECK-NEXT: 1. 2 0.0 0.0 0.0 ldr w5, [x3] # CHECK-NEXT: 2. 2 0.0 0.0 0.0 madd w0, w5, w4, w0 -# CHECK-NEXT: 3. 2 0.0 0.0 1.0 add x3, x3, x13 +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 add x3, x3, x13 # CHECK-NEXT: 4. 2 0.0 0.0 0.0 subs x1, x1, #1 # CHECK-NEXT: 5. 2 0.0 0.0 0.0 str w0, [x21, x18, lsl #2] -# CHECK-NEXT: 2 0.0 0.0 0.2 +# CHECK-NEXT: 2 0.0 0.0 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s index 71c1a0620607..de5dbaa3490c 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s +++ b/llvm/test/tools/llvm-mca/AArch64/Cortex/A55-in-order-retire.s @@ -10,12 +10,12 @@ add w7, w9, w0 # CHECK: Iterations: 2 # CHECK-NEXT: Instructions: 12 -# CHECK-NEXT: Total Cycles: 18 +# CHECK-NEXT: Total Cycles: 20 # CHECK-NEXT: Total uOps: 12 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.67 -# CHECK-NEXT: IPC: 0.67 +# CHECK-NEXT: uOps Per Cycle: 0.60 +# CHECK-NEXT: IPC: 0.60 # CHECK-NEXT: Block RThroughput: 8.0 # CHECK: Instruction Info: @@ -40,33 +40,37 @@ add w7, w9, w0 # CHECK-NEXT: SCHEDQ - Scheduler full: 0 # CHECK-NEXT: LQ - Load queue full: 0 # CHECK-NEXT: SQ - Store queue full: 0 -# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 5 (27.8%) +# CHECK-NEXT: GROUP - Static restrictions on the dispatch group: 1 (5.0%) # CHECK: Dispatch Logic - number of cycles where we saw N micro opcodes dispatched: # CHECK-NEXT: [# dispatched], [# cycles] -# CHECK-NEXT: 0, 12 (66.7%) -# CHECK-NEXT: 2, 6 (33.3%) +# CHECK-NEXT: 0, 12 (60.0%) +# CHECK-NEXT: 1, 4 (20.0%) +# CHECK-NEXT: 2, 4 (20.0%) # CHECK: Schedulers - number of cycles where we saw N micro opcodes issued: # CHECK-NEXT: [# issued], [# cycles] -# CHECK-NEXT: 0, 12 (66.7%) -# CHECK-NEXT: 2, 6 (33.3%) +# CHECK-NEXT: 0, 12 (60.0%) +# CHECK-NEXT: 1, 4 (20.0%) +# CHECK-NEXT: 2, 4 (20.0%) # CHECK: Scheduler's queue usage: # CHECK-NEXT: No scheduler resources used. 
# CHECK: Retire Control Unit - number of cycles where we saw N instructions retired: # CHECK-NEXT: [# retired], [# cycles] -# CHECK-NEXT: 0, 16 (88.9%) -# CHECK-NEXT: 6, 2 (11.1%) +# CHECK-NEXT: 0, 14 (70.0%) +# CHECK-NEXT: 1, 2 (10.0%) +# CHECK-NEXT: 2, 2 (10.0%) +# CHECK-NEXT: 3, 2 (10.0%) # CHECK: Total ROB Entries: 64 -# CHECK-NEXT: Max Used ROB Entries: 8 ( 12.5% ) -# CHECK-NEXT: Average Used ROB Entries per cy: 5 ( 7.8% ) +# CHECK-NEXT: Max Used ROB Entries: 7 ( 10.9% ) +# CHECK-NEXT: Average Used ROB Entries per cy: 2 ( 3.1% ) # CHECK: Register File statistics: # CHECK-NEXT: Total number of mappings created: 12 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Max number of mappings used: 7 # CHECK: Resources: # CHECK-NEXT: [0.0] - CortexA55UnitALU @@ -96,21 +100,21 @@ add w7, w9, w0 # CHECK-NEXT: 0.50 0.50 - - - - - - - - - - add w7, w9, w0 # CHECK: Timeline view: -# CHECK-NEXT: 01234567 +# CHECK-NEXT: 0123456789 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeeeeeeeER. . . sdiv w12, w21, w0 -# CHECK-NEXT: [0,1] DeeE-----R. . . add w8, w8, #1 -# CHECK-NEXT: [0,2] .DeeE----R. . . add w1, w2, w0 -# CHECK-NEXT: [0,3] .DeeE----R. . . add w3, w4, #1 -# CHECK-NEXT: [0,4] . DeeE---R. . . add w5, w6, w0 -# CHECK-NEXT: [0,5] . DeeE---R. . . add w7, w9, w0 -# CHECK-NEXT: [1,0] . . DeeeeeeeER sdiv w12, w21, w0 -# CHECK-NEXT: [1,1] . . DeeE-----R add w8, w8, #1 -# CHECK-NEXT: [1,2] . . DeeE----R add w1, w2, w0 -# CHECK-NEXT: [1,3] . . DeeE----R add w3, w4, #1 -# CHECK-NEXT: [1,4] . . DeeE---R add w5, w6, w0 -# CHECK-NEXT: [1,5] . . DeeE---R add w7, w9, w0 +# CHECK: [0,0] DeeeeeeeER. . . sdiv w12, w21, w0 +# CHECK-NEXT: [0,1] . DeeER. . . add w8, w8, #1 +# CHECK-NEXT: [0,2] . DeeER. . . add w1, w2, w0 +# CHECK-NEXT: [0,3] . .DeeER . . add w3, w4, #1 +# CHECK-NEXT: [0,4] . .DeeER . . add w5, w6, w0 +# CHECK-NEXT: [0,5] . . DeeER . . add w7, w9, w0 +# CHECK-NEXT: [1,0] . . DeeeeeeeER . sdiv w12, w21, w0 +# CHECK-NEXT: [1,1] . . . DeeER . add w8, w8, #1 +# CHECK-NEXT: [1,2] . . . DeeER . add w1, w2, w0 +# CHECK-NEXT: [1,3] . . . DeeER. add w3, w4, #1 +# CHECK-NEXT: [1,4] . . . DeeER. add w5, w6, w0 +# CHECK-NEXT: [1,5] . . . DeeER add w7, w9, w0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -120,9 +124,9 @@ add w7, w9, w0 # CHECK: [0] [1] [2] [3] # CHECK-NEXT: 0. 2 0.0 0.0 0.0 sdiv w12, w21, w0 -# CHECK-NEXT: 1. 2 0.0 0.0 5.0 add w8, w8, #1 -# CHECK-NEXT: 2. 2 0.0 0.0 4.0 add w1, w2, w0 -# CHECK-NEXT: 3. 2 0.0 0.0 4.0 add w3, w4, #1 -# CHECK-NEXT: 4. 2 0.0 0.0 3.0 add w5, w6, w0 -# CHECK-NEXT: 5. 2 0.0 0.0 3.0 add w7, w9, w0 -# CHECK-NEXT: 2 0.0 0.0 3.2 +# CHECK-NEXT: 1. 2 0.0 0.0 0.0 add w8, w8, #1 +# CHECK-NEXT: 2. 2 0.0 0.0 0.0 add w1, w2, w0 +# CHECK-NEXT: 3. 2 0.0 0.0 0.0 add w3, w4, #1 +# CHECK-NEXT: 4. 2 0.0 0.0 0.0 add w5, w6, w0 +# CHECK-NEXT: 5. 2 0.0 0.0 0.0 add w7, w9, w0 +# CHECK-NEXT: 2 0.0 0.0 0.0 -- GitLab From eefda605fe1701937a8fe5379357e0990ad2fb4e Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Mon, 8 Mar 2021 16:54:11 +0000 Subject: [PATCH 0040/1000] [flang][driver] Add support for `-fget-symbols-sources` Adds support for `-fget-symbols-sources` in the new Flang driver. All relevant tests are updated to use the new driver when `FLANG_BUILD_NEW_DRIVER` is set. `RUN` lines in tests are updated so `-fsyntax-only` comes before `-fget-symbols-sources`. That's because: * both `-fsyntax-only` and `-fget-symbols-sources` are action flags, and * the new driver, flang-new, will only consider the right-most action flag. 
In other words, this change is needed so that the tests work with both `f18` (requires both flags) and `flang-new` (only considers the last action flag). Differential Revision: https://reviews.llvm.org/D98191 --- clang/include/clang/Driver/Options.td | 2 ++ flang/include/flang/Frontend/FrontendActions.h | 4 ++++ flang/include/flang/Frontend/FrontendOptions.h | 5 ++++- flang/lib/Frontend/CompilerInvocation.cpp | 3 +++ flang/lib/Frontend/FrontendActions.cpp | 10 ++++++++++ flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp | 3 +++ flang/test/Driver/driver-help.f90 | 1 + flang/test/Semantics/getsymbols01.f90 | 2 +- flang/test/Semantics/getsymbols02.f90 | 6 +++--- flang/test/Semantics/getsymbols03-a.f90 | 2 +- flang/test/Semantics/getsymbols04.f90 | 2 +- flang/test/Semantics/getsymbols05.f90 | 2 +- 12 files changed, 34 insertions(+), 8 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a3a0d86c054a..55dddab6160c 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4364,6 +4364,8 @@ def fdebug_pre_fir_tree : Flag<["-"], "fdebug-pre-fir-tree">, Group; def fdebug_module_writer : Flag<["-"],"fdebug-module-writer">, HelpText<"Enable debug messages while writing module files">; +def fget_symbols_sources : Flag<["-"], "fget-symbols-sources">, Group, + HelpText<"Dump symbols and their source code locations">; } diff --git a/flang/include/flang/Frontend/FrontendActions.h b/flang/include/flang/Frontend/FrontendActions.h index 35d1e6f29b0f..f49f9f4714b5 100644 --- a/flang/include/flang/Frontend/FrontendActions.h +++ b/flang/include/flang/Frontend/FrontendActions.h @@ -100,6 +100,10 @@ class DebugPreFIRTreeAction : public PrescanAndSemaAction { void ExecuteAction() override; }; +class GetSymbolsSourcesAction : public PrescanAndSemaAction { + void ExecuteAction() override; +}; + class ParseSyntaxOnlyAction : public PrescanAndSemaAction { void ExecuteAction() override; }; diff --git a/flang/include/flang/Frontend/FrontendOptions.h b/flang/include/flang/Frontend/FrontendOptions.h index 48182f488466..1d9002335c3c 100644 --- a/flang/include/flang/Frontend/FrontendOptions.h +++ b/flang/include/flang/Frontend/FrontendOptions.h @@ -58,7 +58,10 @@ enum ActionKind { DebugMeasureParseTree, /// Parse, run semantics and then output the pre-FIR tree - DebugPreFIRTree + DebugPreFIRTree, + + /// Parse, run semantics and then dump symbol sources map + GetSymbolsSources /// TODO: RunPreprocessor, EmitLLVM, EmitLLVMOnly, /// EmitCodeGenOnly, EmitAssembly, (...) 
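
Since the new `GetSymbolsSources` entry above is selected the same way as
every other action kind, the right-most action flag wins. A minimal,
self-contained sketch of that behaviour (the names here are invented for
illustration; flang's real dispatch is in CompilerInvocation.cpp below):

#include <optional>
#include <string>
#include <vector>

enum class ActionKind { ParseSyntaxOnly, GetSymbolsSources };

// Every action flag is inspected, but a later flag simply overwrites an
// earlier one, so only the right-most action flag decides what runs.
std::optional<ActionKind> pickAction(const std::vector<std::string> &argv) {
  std::optional<ActionKind> action;
  for (const std::string &arg : argv) {
    if (arg == "-fsyntax-only")
      action = ActionKind::ParseSyntaxOnly;
    else if (arg == "-fget-symbols-sources")
      action = ActionKind::GetSymbolsSources;
  }
  return action;
}

// pickAction({"-fsyntax-only", "-fget-symbols-sources"}) yields
// GetSymbolsSources, which is why the updated RUN lines put that flag last.
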
diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 1271cd314831..d2318d3d683d 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -143,6 +143,9 @@ static InputKind ParseFrontendArgs(FrontendOptions &opts, case clang::driver::options::OPT_fdebug_pre_fir_tree: opts.programAction_ = DebugPreFIRTree; break; + case clang::driver::options::OPT_fget_symbols_sources: + opts.programAction_ = GetSymbolsSources; + break; // TODO: // case calng::driver::options::OPT_emit_llvm: diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index ea283fe7a0c9..1871a35444db 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -16,6 +16,7 @@ #include "flang/Parser/provenance.h" #include "flang/Parser/source.h" #include "flang/Parser/unparse.h" +#include "flang/Semantics/runtime-type-info.h" #include "flang/Semantics/semantics.h" #include "flang/Semantics/unparse-with-symbols.h" #include "llvm/ADT/StringRef.h" @@ -314,6 +315,15 @@ void DebugDumpParsingLogAction::ExecuteAction() { ci.parsing().DumpParsingLog(llvm::outs()); } +void GetSymbolsSourcesAction::ExecuteAction() { + // Report and exit if fatal semantic errors are present + if (reportFatalSemanticErrors(semantics(), this->instance().diagnostics(), + GetCurrentFileOrBufferName())) + return; + + semantics().DumpSymbolsSources(llvm::outs()); +} + void EmitObjAction::ExecuteAction() { CompilerInstance &ci = this->instance(); unsigned DiagID = ci.diagnostics().getCustomDiagID( diff --git a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index 041e79b946f5..2a08e388a9d8 100644 --- a/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/flang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -61,6 +61,9 @@ static std::unique_ptr CreateFrontendBaseAction( case DebugPreFIRTree: return std::make_unique(); break; + case GetSymbolsSources: + return std::make_unique(); + break; default: break; // TODO: diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90 index c32975416f2f..0c7e37f2bc72 100644 --- a/flang/test/Driver/driver-help.f90 +++ b/flang/test/Driver/driver-help.f90 @@ -80,6 +80,7 @@ ! HELP-FC1-NEXT: -ffixed-line-length= ! HELP-FC1-NEXT: Use as character line width in fixed mode ! HELP-FC1-NEXT: -ffree-form Process source files in free form +! HELP-FC1-NEXT: -fget-symbols-sources Dump symbols and their source code locations ! HELP-FC1-NEXT: -fimplicit-none No implicit typing allowed unless overridden by IMPLICIT statements ! HELP-FC1-NEXT: -finput-charset= Specify the default character set for source files ! HELP-FC1-NEXT: -flarge-sizes Use INTEGER(KIND=8) for the result type in size-related intrinsics diff --git a/flang/test/Semantics/getsymbols01.f90 b/flang/test/Semantics/getsymbols01.f90 index d26aa774ace4..9a52ee7cbf2a 100644 --- a/flang/test/Semantics/getsymbols01.f90 +++ b/flang/test/Semantics/getsymbols01.f90 @@ -15,7 +15,7 @@ contains end function end module -! RUN: %f18 -fget-symbols-sources -fsyntax-only %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -fsyntax-only -fget-symbols-sources %s 2>&1 | FileCheck %s ! CHECK-COUNT-1:f:{{.*}}getsymbols01.f90, 12, 26-27 ! CHECK-COUNT-1:mm1:{{.*}}getsymbols01.f90, 2, 8-11 ! 
CHECK-COUNT-1:s:{{.*}}getsymbols01.f90, 5, 18-19 diff --git a/flang/test/Semantics/getsymbols02.f90 b/flang/test/Semantics/getsymbols02.f90 index 1667548f81c3..32929904fb7a 100644 --- a/flang/test/Semantics/getsymbols02.f90 +++ b/flang/test/Semantics/getsymbols02.f90 @@ -7,8 +7,8 @@ PROGRAM helloworld i = callget5() ENDPROGRAM -! RUN: %f18 -fsyntax-only %S/Inputs/getsymbols02-a.f90 -! RUN: %f18 -fsyntax-only %S/Inputs/getsymbols02-b.f90 -! RUN: %f18 -fget-symbols-sources -fsyntax-only %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -fsyntax-only %S/Inputs/getsymbols02-a.f90 +! RUN: %flang_fc1 -fsyntax-only %S/Inputs/getsymbols02-b.f90 +! RUN: %flang_fc1 -fsyntax-only -fget-symbols-sources %s 2>&1 | FileCheck %s ! CHECK: callget5: .{{[/\\]}}mm2b.mod, ! CHECK: get5: .{{[/\\]}}mm2a.mod, diff --git a/flang/test/Semantics/getsymbols03-a.f90 b/flang/test/Semantics/getsymbols03-a.f90 index fddf513bcc51..0bc19b4fe8d0 100644 --- a/flang/test/Semantics/getsymbols03-a.f90 +++ b/flang/test/Semantics/getsymbols03-a.f90 @@ -7,7 +7,7 @@ program main x = f end program -! RUN: %f18 -fget-symbols-sources -fsyntax-only %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -fsyntax-only -fget-symbols-sources %s 2>&1 | FileCheck %s ! CHECK:f:{{.*}}getsymbols03-b.f90, 2, 12-13 ! CHECK:main:{{.*}}getsymbols03-a.f90, 4, 9-13 ! CHECK:mm3:{{.*}}getsymbols03-a.f90, 5, 6-9 diff --git a/flang/test/Semantics/getsymbols04.f90 b/flang/test/Semantics/getsymbols04.f90 index ac8f2d0a7e44..28027ea759b6 100644 --- a/flang/test/Semantics/getsymbols04.f90 +++ b/flang/test/Semantics/getsymbols04.f90 @@ -6,7 +6,7 @@ program main x = y end program -! RUN: %f18 -fget-symbols-sources -fsyntax-only %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -fsyntax-only -fget-symbols-sources %s 2>&1 | FileCheck %s ! CHECK:x:{{.*}}getsymbols04.f90, 3, 14-15 ! CHECK:x:{{.*}}getsymbols04.f90, 5, 11-12 ! CHECK:y:{{.*}}getsymbols04.f90, 4, 14-15 diff --git a/flang/test/Semantics/getsymbols05.f90 b/flang/test/Semantics/getsymbols05.f90 index 6b07678e42d0..99771e227c3f 100644 --- a/flang/test/Semantics/getsymbols05.f90 +++ b/flang/test/Semantics/getsymbols05.f90 @@ -9,7 +9,7 @@ program main x = y end program -! RUN: %f18 -fget-symbols-sources -fsyntax-only %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -fsyntax-only -fget-symbols-sources %s 2>&1 | FileCheck %s ! CHECK:x:{{.*}}getsymbols05.f90, 3, 14-15 ! CHECK:x:{{.*}}getsymbols05.f90, 6, 16-17 ! CHECK:y:{{.*}}getsymbols05.f90, 4, 14-15 -- GitLab From 68bb51acd572735d80d20adb2c2fc51a5cbbd88e Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 14 Jan 2021 15:06:24 +0100 Subject: [PATCH 0041/1000] [lldb] Fix TestAutoInstallMainExecutable.py Fix the test to account for recent test infrastructure changes, and make it run locally to increase the chances of it continuing to work in the future. --- .../TestAutoInstallMainExecutable.py | 66 ++++++++----------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py b/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py index 5afb57f3ac46..92151cea4e67 100644 --- a/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py +++ b/lldb/test/API/commands/target/auto-install-main-executable/TestAutoInstallMainExecutable.py @@ -2,61 +2,56 @@ Test target commands: target.auto-install-main-executable. 
""" +import socket import time -import gdbremote_testcase +import lldbgdbserverutils from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil -class TestAutoInstallMainExecutable(gdbremote_testcase.GdbRemoteTestCaseBase): +class TestAutoInstallMainExecutable(TestBase): mydir = TestBase.compute_mydir(__file__) + NO_DEBUG_INFO_TESTCASE = True - @llgs_test - @no_debug_info_test - @skipIf(remote=False) - @expectedFailureAll(hostoslist=["windows"], triple='.*-android') + @skipIfRemote + @expectedFailureAll(oslist=["windows"]) # process modules not loaded def test_target_auto_install_main_executable(self): self.build() - # Manually install the modified binary. - working_dir = lldb.remote_platform.GetWorkingDirectory() - src_device = lldb.SBFileSpec(self.getBuildArtifact("a.device.out")) - dest = lldb.SBFileSpec(os.path.join(working_dir, "a.out")) - err = lldb.remote_platform.Put(src_device, dest) - if err.Fail(): - raise RuntimeError( - "Unable copy '%s' to '%s'.\n>>> %s" % - (src_device.GetFilename(), working_dir, err.GetCString())) - - m = re.search("^(.*)://([^/]*):(.*)$", configuration.lldb_platform_url) - protocol = m.group(1) - hostname = m.group(2) - hostport = int(m.group(3)) - listen_url = "*:"+str(hostport+1) + hostname = socket.getaddrinfo("localhost", 0, proto=socket.IPPROTO_TCP)[0][4][0] + listen_url = "[%s]:0"%hostname + port_file = self.getBuildArtifact("port") commandline_args = [ "platform", "--listen", listen_url, - "--server" - ] - + "--socket-file", + port_file] self.spawnSubprocess( - self.debug_monitor_exe, - commandline_args, - install_remote=False) + lldbgdbserverutils.get_lldb_server_exe(), + commandline_args) - # Wait for the new process gets ready. - time.sleep(0.1) + socket_id = lldbutil.wait_for_file_on_target(self, port_file) - self.dbg.SetAsync(False) - - new_platform = lldb.SBPlatform(lldb.remote_platform.GetName()) + new_platform = lldb.SBPlatform("remote-" + self.getPlatform()) self.dbg.SetSelectedPlatform(new_platform) - connect_url = "%s://%s:%s" % (protocol, hostname, str(hostport+1)) + connect_url = "connect://[%s]:%s" % (hostname, socket_id) + connect_opts = lldb.SBPlatformConnectOptions(connect_url) + self.assertSuccess(new_platform.ConnectRemote(connect_opts)) + + wd = self.getBuildArtifact("wd") + os.mkdir(wd) + new_platform.SetWorkingDirectory(wd) + + + # Manually install the modified binary. + src_device = lldb.SBFileSpec(self.getBuildArtifact("a.device.out")) + dest = lldb.SBFileSpec(os.path.join(wd, "a.out")) + self.assertSuccess(new_platform.Put(src_device, dest)) # Test the default setting. self.expect("settings show target.auto-install-main-executable", @@ -68,12 +63,9 @@ class TestAutoInstallMainExecutable(gdbremote_testcase.GdbRemoteTestCaseBase): self.expect("settings show target.auto-install-main-executable", substrs=["target.auto-install-main-executable (boolean) = false"]) - self.runCmd("platform select %s"%configuration.lldb_platform_name) - self.runCmd("platform connect %s" % (connect_url)) - # Create the target with the original file. 
self.runCmd("target create --remote-file %s %s "% - (os.path.join(working_dir,dest.GetFilename()), + (dest.fullpath, self.getBuildArtifact("a.out"))) target = self.dbg.GetSelectedTarget() -- GitLab From ed8bff13dcaa123721e0117fb586c3124c03a421 Mon Sep 17 00:00:00 2001 From: "caoming.roy" Date: Thu, 18 Mar 2021 10:38:30 -0400 Subject: [PATCH 0042/1000] [lld-macho] implement options -map Implement command-line options -map Reviewed By: int3, #lld-macho Differential Revision: https://reviews.llvm.org/D98323 --- lld/MachO/CMakeLists.txt | 1 + lld/MachO/Config.h | 1 + lld/MachO/Driver.cpp | 1 + lld/MachO/MapFile.cpp | 151 ++++++++++++++++++++++++++++++++++++++ lld/MachO/MapFile.h | 18 +++++ lld/MachO/Options.td | 1 - lld/MachO/Writer.cpp | 2 + lld/test/MachO/map-file.s | 50 +++++++++++++ 8 files changed, 224 insertions(+), 1 deletion(-) create mode 100644 lld/MachO/MapFile.cpp create mode 100644 lld/MachO/MapFile.h create mode 100644 lld/test/MachO/map-file.s diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt index 8eb3371580b7..16b372945d07 100644 --- a/lld/MachO/CMakeLists.txt +++ b/lld/MachO/CMakeLists.txt @@ -24,6 +24,7 @@ add_lld_library(lldMachO2 Symbols.cpp SyntheticSections.cpp Target.cpp + MapFile.cpp Writer.cpp LINK_COMPONENTS diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h index 93c6a11c0808..611440185837 100644 --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -88,6 +88,7 @@ struct Configuration { uint32_t timeTraceGranularity; std::string progName; llvm::StringRef installName; + llvm::StringRef mapFile; llvm::StringRef outputFile; llvm::StringRef ltoObjPath; bool demangle = false; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 207dc4f36e6a..c85b72564213 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -826,6 +826,7 @@ bool macho::link(ArrayRef argsArr, bool canExitEarly, for (const Arg *arg : args.filtered(OPT_U)) symtab->addDynamicLookup(arg->getValue()); + config->mapFile = args.getLastArgValue(OPT_map); config->outputFile = args.getLastArgValue(OPT_o, "a.out"); config->headerPad = args::getHex(args, OPT_headerpad, /*Default=*/32); config->headerPadMaxInstallNames = diff --git a/lld/MachO/MapFile.cpp b/lld/MachO/MapFile.cpp new file mode 100644 index 000000000000..e089136ee218 --- /dev/null +++ b/lld/MachO/MapFile.cpp @@ -0,0 +1,151 @@ +//===- MapFile.cpp --------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the -map option. 
It prints, in order, the output
+// file path, the target architecture, the input files, the output sections,
+// and the symbols, as in this example:
+//
+// # Path: test
+// # Arch: x86_64
+// # Object files:
+// [  0] linker synthesized
+// [  1] a.o
+// # Sections:
+// # Address     Size        Segment  Section
+// 0x1000005C0   0x0000004C  __TEXT   __text
+// # Symbols:
+// # Address     File  Name
+// 0x1000005C0   [  1] _main
+//
+//===----------------------------------------------------------------------===//
+
+#include "MapFile.h"
+#include "Config.h"
+#include "InputFiles.h"
+#include "InputSection.h"
+#include "OutputSection.h"
+#include "OutputSegment.h"
+#include "Symbols.h"
+#include "Target.h"
+#include "llvm/Support/Parallel.h"
+
+using namespace llvm;
+using namespace llvm::sys;
+using namespace lld;
+using namespace lld::macho;
+
+using SymbolMapTy = DenseMap<const InputSection *, SmallVector<Defined *, 4>>;
+
+// Returns a map from sections to their symbols.
+static SymbolMapTy getSectionSyms(ArrayRef<Defined *> syms) {
+  SymbolMapTy ret;
+  for (Defined *dr : syms)
+    ret[dr->isec].push_back(dr);
+
+  // Sort symbols by address. We want to print out symbols in the
+  // order in the output file rather than the order they appeared
+  // in the input files.
+  for (auto &it : ret)
+    llvm::stable_sort(it.second, [](Defined *a, Defined *b) {
+      return a->getVA() < b->getVA();
+    });
+  return ret;
+}
+
+// Returns a list of all symbols that we want to print out.
+static std::vector<Defined *> getSymbols() {
+  std::vector<Defined *> v;
+  for (InputFile *file : inputFiles)
+    if (isa<ObjFile>(file))
+      for (Symbol *sym : file->symbols) {
+        if (sym == nullptr)
+          continue;
+        if (auto *d = dyn_cast<Defined>(sym))
+          if (d->isec && d->getFile() == file)
+            v.push_back(d);
+      }
+  return v;
+}
+
+// Construct a map from symbols to their stringified representations.
+// Demangling symbols (which is what toString() does) is slow, so
+// we do that in batch using parallel-for.
+static DenseMap<Symbol *, std::string>
+getSymbolStrings(ArrayRef<Defined *> syms) {
+  std::vector<std::string> str(syms.size());
+  parallelForEachN(0, syms.size(), [&](size_t i) {
+    raw_string_ostream os(str[i]);
+    os << toString(*syms[i]);
+  });
+
+  DenseMap<Symbol *, std::string> ret;
+  for (size_t i = 0, e = syms.size(); i < e; ++i)
+    ret[syms[i]] = std::move(str[i]);
+  return ret;
+}
+
+void macho::writeMapFile() {
+  if (config->mapFile.empty())
+    return;
+
+  // Open a map file for writing.
+  std::error_code ec;
+  raw_fd_ostream os(config->mapFile, ec, sys::fs::OF_None);
+  if (ec) {
+    error("cannot open " + config->mapFile + ": " + ec.message());
+    return;
+  }
+
+  // Dump output path
+  os << format("# Path: %s\n", config->outputFile.str().c_str());
+
+  // Dump output architecture
+  os << format("# Arch: %s\n",
+               getArchitectureName(config->target.Arch).str().c_str());
+
+  // Dump table of object files
+  os << "# Object files:\n";
+  os << format("[%3u] %s\n", 0, (const char *)"linker synthesized");
+  uint32_t fileIndex = 1;
+  DenseMap<InputFile *, uint32_t> readerToFileOrdinal;
+  for (InputFile *file : inputFiles) {
+    if (isa<ObjFile>(file)) {
+      os << format("[%3u] %s\n", fileIndex, file->getName().str().c_str());
+      readerToFileOrdinal[file] = fileIndex++;
+    }
+  }
+
+  // Collect symbol info that we want to print out. 
+  std::vector<Defined *> syms = getSymbols();
+  SymbolMapTy sectionSyms = getSectionSyms(syms);
+  DenseMap<Symbol *, std::string> symStr = getSymbolStrings(syms);
+
+  // Dump table of sections
+  os << "# Sections:\n";
+  os << "# Address\tSize    \tSegment\tSection\n";
+  for (OutputSegment *seg : outputSegments)
+    for (OutputSection *osec : seg->getSections()) {
+      if (osec->isHidden())
+        continue;
+
+      os << format("0x%08llX\t0x%08llX\t%s\t%s\n", osec->addr, osec->getSize(),
+                   seg->name.str().c_str(), osec->name.str().c_str());
+    }
+
+  // Dump table of symbols
+  os << "# Symbols:\n";
+  os << "# Address\t    File  Name\n";
+  for (InputSection *isec : inputSections) {
+    for (macho::Symbol *sym : sectionSyms[isec]) {
+      os << format("0x%08llX\t[%3u] %s\n", sym->getVA(),
+                   readerToFileOrdinal[sym->getFile()], symStr[sym].c_str());
+    }
+  }
+
+  // TODO: when we implement -dead_strip, we should dump dead stripped symbols
+}
diff --git a/lld/MachO/MapFile.h b/lld/MachO/MapFile.h
new file mode 100644
index 000000000000..bf16ffdd0382
--- /dev/null
+++ b/lld/MachO/MapFile.h
@@ -0,0 +1,18 @@
+//===- MapFile.h ------------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//

+#ifndef LLD_MACHO_MAPFILE_H
+#define LLD_MACHO_MAPFILE_H
+
+namespace lld {
+namespace macho {
+void writeMapFile();
+} // namespace macho
+} // namespace lld
+
+#endif
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 6af0d2c4152f..af8e44e73724 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -500,7 +500,6 @@ def order_file_statistics : Flag<["-"], "order_file_statistics">,
 def map : Separate<["-"], "map">,
     MetaVarName<"<path>">,
     HelpText<"Writes all symbols and their addresses to <path>">,
-    Flags<[HelpHidden]>,
     Group<grp_introspection>;
 def dependency_info : Separate<["-"], "dependency_info">,
     MetaVarName<"<path>">,
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
index 62f4eef19498..b2d316355807 100644
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -10,6 +10,7 @@
 #include "Config.h"
 #include "InputFiles.h"
 #include "InputSection.h"
+#include "MapFile.h"
 #include "MergedOutputSection.h"
 #include "OutputSection.h"
 #include "OutputSegment.h"
@@ -926,6 +927,7 @@ void Writer::run() {
   createLoadCommands();
   finalizeAddressses();
   finalizeLinkEditSegment();
+  writeMapFile();
   openFile();
 
   if (errorCount())
     return;
diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s
new file mode 100644
index 000000000000..ac5fb93898d3
--- /dev/null
+++ b/lld/test/MachO/map-file.s
@@ -0,0 +1,50 @@
+# REQUIRES: x86
+# RUN: rm -rf %t; split-file %s %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/foo.s -o %t/foo.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/test.s -o %t/test.o
+
+# RUN: %lld -map %t/map %t/test.o %t/foo.o -o %t/test-map
+# RUN: llvm-objdump --syms --section-headers %t/test-map > %t/objdump
+# RUN: cat %t/objdump %t/map > %t/out
+# RUN: FileCheck %s < %t/out
+
+#--- foo.s
+.section __TEXT,obj
+.globl _foo
+_foo:
+
+#--- test.s
+.comm _number, 1
+.globl _main
+_main:
+  ret
+
+# CHECK: Sections:
+# CHECK-NEXT: Idx Name Size VMA Type
+# CHECK-NEXT: 0 __text {{[0-9a-f]+}} [[#%x,TEXT:]] TEXT
+# CHECK-NEXT: 1 obj {{[0-9a-f]+}} [[#%x,DATA:]] DATA
+# CHECK-NEXT: 2 __common {{[0-9a-f]+}} [[#%x,BSS:]] BSS
+
+# CHECK: SYMBOL TABLE:
+# CHECK-NEXT: 
[[#%x,MAIN:]] g F __TEXT,__text _main
+# CHECK-NEXT: [[#%x,NUMBER:]] g O __DATA,__common _number
+# CHECK-NEXT: [[#%x,FOO:]] g O __TEXT,obj _foo
+
+# CHECK-NEXT: # Path: {{.*}}{{/|\\}}map-file.s.tmp/test-map
+# CHECK-NEXT: # Arch: x86_64
+# CHECK-NEXT: # Object files:
+# CHECK-NEXT: [ 0] linker synthesized
+# CHECK-NEXT: [ 1] {{.*}}{{/|\\}}map-file.s.tmp/test.o
+# CHECK-NEXT: [ 2] {{.*}}{{/|\\}}map-file.s.tmp/foo.o

+# CHECK-NEXT: # Sections:
+# CHECK-NEXT: # Address Size Segment Section
+# CHECK-NEXT: 0x[[#TEXT]] 0x{{[0-9a-f]+}} __TEXT __text
+# CHECK-NEXT: 0x[[#DATA]] 0x{{[0-9a-f]+}} __TEXT obj
+# CHECK-NEXT: 0x[[#BSS]] 0x{{[0-9a-f]+}} __DATA __common

+# CHECK-NEXT: # Symbols:
+# CHECK-NEXT: # Address File Name
+# CHECK-NEXT: 0x[[#NUMBER]] [ 1] _number
+# CHECK-NEXT: 0x[[#MAIN]] [ 1] _main
+# CHECK-NEXT: 0x[[#FOO]] [ 2] _foo
-- 
GitLab


From 6333ee2184f1941c1ac548d99ecd8807be499bf1 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot 
Date: Thu, 18 Mar 2021 14:39:37 +0000
Subject: [PATCH 0043/1000] [gn build] Port ed8bff13dcaa

---
 llvm/utils/gn/secondary/lld/MachO/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/lld/MachO/BUILD.gn b/llvm/utils/gn/secondary/lld/MachO/BUILD.gn
index d0fe2e128d9e..98c38399b08b 100644
--- a/llvm/utils/gn/secondary/lld/MachO/BUILD.gn
+++ b/llvm/utils/gn/secondary/lld/MachO/BUILD.gn
@@ -31,6 +31,7 @@ static_library("MachO2") {
     "InputFiles.cpp",
     "InputSection.cpp",
     "LTO.cpp",
+    "MapFile.cpp",
     "MergedOutputSection.cpp",
     "ObjC.cpp",
     "OutputSection.cpp",
-- 
GitLab


From c539be1dcbcf88530cfaf1728b077feb564b72ec Mon Sep 17 00:00:00 2001
From: Sid Manning 
Date: Thu, 11 Mar 2021 11:44:57 -0600
Subject: [PATCH 0044/1000] [Hexagon] Add support for named registers cs0 and cs1

Allow inline assembly code to reference cs0 and cs1.
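
For illustration, a hedged sketch of the user-level code this unlocks. The
variable and function names are invented, and whether a given clang build
accepts control registers as global register variables is an assumption
here; the llvm.read_register tests below are the authoritative form.

// C/C++ sketch: named-register variables lower to llvm.read_register
// with !llvm.named.register metadata, which getRegisterByName now
// resolves for "cs0"/"cs1" on Hexagon.
register unsigned cs0_val __asm__("cs0");
register unsigned cs1_val __asm__("cs1");

unsigned read_circular_start_regs(void) {
  // Expected to compile to "r0 = cs0" / "r0 = cs1" style transfers,
  // matching the CHECK lines added to namedreg.ll.
  return cs0_val ^ cs1_val;
}
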
---
 clang/lib/Basic/Targets/Hexagon.cpp        |  2 +-
 .../Target/Hexagon/HexagonISelLowering.cpp |  2 ++
 llvm/test/CodeGen/Hexagon/namedreg.ll      | 21 ++++++++++++++++++-
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Basic/Targets/Hexagon.cpp b/clang/lib/Basic/Targets/Hexagon.cpp
index ba10459e9690..d1613fb22930 100644
--- a/clang/lib/Basic/Targets/Hexagon.cpp
+++ b/clang/lib/Basic/Targets/Hexagon.cpp
@@ -136,7 +136,7 @@ const char *const HexagonTargetInfo::GCCRegNames[] = {
     "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18",
     "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28",
     "r29", "r30", "r31", "p0", "p1", "p2", "p3",
-    "sa0", "lc0", "sa1", "lc1", "m0", "m1", "usr", "ugp",
+    "sa0", "lc0", "sa1", "lc1", "m0", "m1", "usr", "ugp", "cs0", "cs1",
     "r1:0", "r3:2", "r5:4", "r7:6", "r9:8", "r11:10", "r13:12", "r15:14",
     "r17:16", "r19:18", "r21:20", "r23:22", "r25:24", "r27:26", "r29:28",
     "r31:30"
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 30c3d3d4f570..a7e9ed34bfcb 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -308,6 +308,8 @@ Register HexagonTargetLowering::getRegisterByName(
       .Case("m1", Hexagon::M1)
       .Case("usr", Hexagon::USR)
       .Case("ugp", Hexagon::UGP)
+      .Case("cs0", Hexagon::CS0)
+      .Case("cs1", Hexagon::CS1)
       .Default(Register());
   if (Reg)
     return Reg;
diff --git a/llvm/test/CodeGen/Hexagon/namedreg.ll b/llvm/test/CodeGen/Hexagon/namedreg.ll
index 72ca50868828..a905332b2dee 100644
--- a/llvm/test/CodeGen/Hexagon/namedreg.ll
+++ b/llvm/test/CodeGen/Hexagon/namedreg.ll
@@ -4,10 +4,29 @@ entry:
   %0 = call i32 @llvm.read_register.i32(metadata !0)
   ret i32 %0
 }
-
 declare i32 @llvm.read_register.i32(metadata) #1
 
+define dso_local i32 @rcs0() #0 {
+entry:
+  %0 = call i32 @llvm.read_register.i32(metadata !1)
+  ret i32 %0
+}
+
+define dso_local i32 @rcs1() #0 {
+entry:
+  %0 = call i32 @llvm.read_register.i32(metadata !2)
+  ret i32 %0
+}
+
+
+
 !llvm.named.register.r19 = !{!0}
+!llvm.named.register.cs0 = !{!1}
+!llvm.named.register.cs1 = !{!2}
 
 !0 = !{!"r19"}
+!1 = !{!"cs0"}
+!2 = !{!"cs1"}
 ; CHECK: r0 = r19
+; CHECK: r0 = cs0
+; CHECK: r0 = cs1
-- 
GitLab


From 283799157e504597fc3034cc5fa02faa4e11fa58 Mon Sep 17 00:00:00 2001
From: Alexander Belyaev 
Date: Thu, 18 Mar 2021 16:04:02 +0100
Subject: [PATCH 0045/1000] [mlir][linalg] Add support for memref
 inputs/outputs for `linalg.tiled_loop`.

Also use `ArrayAttr` to pass iterator types to the TiledLoopOp builder.
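
A hedged sketch of what a caller looks like after this change (`b`, `loc`,
and the operand ranges are assumed to exist in the surrounding code; this
is illustrative, not a verbatim excerpt from the patch):

// Building a memref-based tiled_loop with ArrayAttr iterator types.
ArrayAttr iterTypes = b.getStrArrayAttr({"parallel"});
auto loop = b.create<linalg::TiledLoopOp>(
    loc, lowerBounds, upperBounds, steps, inputs, outputs, iterTypes,
    [&](OpBuilder &nested, Location nestedLoc, ValueRange ivs) {
      // Subviews of the memref operands and the tiled computation go
      // here; the terminator is inserted by ensureTerminator.
    });
// With memref outputs the op produces no results, since result types are
// now added only for RankedTensorType outputs.
assert(loop->getNumResults() == 0);
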
Differential Revision: https://reviews.llvm.org/D98871 --- .../mlir/Dialect/Linalg/IR/LinalgBase.td | 2 + .../mlir/Dialect/Linalg/IR/LinalgOps.td | 51 +++++++++++++++---- .../Dialect/Linalg/IR/LinalgStructuredOps.td | 2 - mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 15 ++++-- 4 files changed, 53 insertions(+), 17 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td index a706d67d2988..5a906ff2dafd 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgBase.td @@ -15,6 +15,8 @@ include "mlir/IR/OpBase.td" +def LinalgOperand: AnyTypeOf<[AnyRankedTensor, AnyStridedMemRef]>; + def Linalg_Dialect : Dialect { let name = "linalg"; let description = [{ diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td index 63bee92ded7c..d54efbe37a57 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -496,21 +496,25 @@ def Linalg_TiledLoopOp : Linalg_Op<"tiled_loop", [ let summary = "Linalg tiled loop operation"; let description = [{ This is a loop-like operation with additional properties. The arguments - also include the input and the output tensors and the attributes to specify - the iterator types. The body region of the loop contains `subtensor` - operations applied to every tensor argument of TiledLoopOp. + also include the input and the output tensors or memrefs and the attributes + to specify the iterator types. + + Parsing TiledLoopOp will set all elements of the `iterator_types` attribute + to "parallel" type, when it is absent from the custom format. + + Tensor-based version: + + The body region of the loop contains `subtensor` operations applied to + every tensor argument of TiledLoopOp. The body region must contain exactly one block that terminates with `linalg.yield` with the operands resulting from `subtensor_insert` operations. - Parsing TiledLoopOp will set all elements of the `iterator_types` attribute - to "parallel" type, when it is absent from the custom format. - Example: ```mlir - linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4) + %0 = linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4) ins(%lhs, %rhs : tensor<24x64xi8>, tensor<24x64xi8>) outs(%out : tensor<24x64xi8>) iterators("parallel") { @@ -528,13 +532,40 @@ def Linalg_TiledLoopOp : Linalg_Op<"tiled_loop", [ linalg.yield %result : tensor<24x64xi8> } ``` + + MemRef-based version: + + The body region of the loop contains `subview` operations applied to + every memref argument of TiledLoopOp. + + The body region must contain exactly one block that terminates with + `linalg.yield` with no operands. + + Example: + + ```mlir + linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4) + ins(%lhs, %rhs : memref<24x64xi8>, memref<24x64xi8>) + outs(%out : memref<24x64xi8>) + iterators("parallel") { + %lhs_sub = subview %lhs[%i, 0] [%c4, %c64] [1, 1] + : memref<24x64xi8> to memref + %rhs_sub = subview %rhs[%i, 0] [%c4, %c64] [1, 1] + : memref<24x64xi8> to memref + %out_sub = subview %out[%i, 0] [%c4, %c64] [1, 1] + : memref<24x64xi8> to memref + + %result_sub = linalg.generic ... 
+      linalg.yield
+    }
+    ```
   }];

   let arguments = (ins Variadic<Index>:$lowerBound,
                        Variadic<Index>:$upperBound,
                        Variadic<Index>:$step,
-                       Variadic<AnyRankedTensor>:$inputs,
-                       Variadic<AnyRankedTensor>:$outputs,
+                       Variadic<LinalgOperand>:$inputs,
+                       Variadic<LinalgOperand>:$outputs,
                        ArrayAttr:$iterator_types);
   let results = (outs Variadic<AnyRankedTensor>:$results);
   let regions = (region SizedRegion<1>:$region);
@@ -542,7 +573,7 @@ def Linalg_TiledLoopOp : Linalg_Op<"tiled_loop", [
   let builders = [
     OpBuilder<(ins "ValueRange":$lowerBounds, "ValueRange":$upperBounds,
       "ValueRange":$steps, "ValueRange":$inputs, "ValueRange":$outputs,
-      "ArrayRef<StringRef>":$iteratorTypes,
+      "ArrayAttr":$iteratorTypes,
       CArg<"function_ref<void(OpBuilder &, Location, ValueRange)>",
            "nullptr">:$bodyBuilderFn)>,
   ];
diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
index f87a1eaeac8f..69aa7659b81c 100644
--- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td
@@ -496,8 +496,6 @@ def PoolingSumOp: SingleInputPoolingBase_Op<"pooling_sum"> {
 //===----------------------------------------------------------------------===//
 // Generic Linalg ops.
 //===----------------------------------------------------------------------===//
-def LinalgOperand: AnyTypeOf<[AnyRankedTensor, AnyStridedMemRef]>;
-
 class LinalgOperandOfRank<int rank>: Type<
   And<[
     LinalgOperand.predicate,
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index 3b268d703a74..13cca7f19ee7 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -1744,7 +1744,7 @@ static LogicalResult verify(linalg::YieldOp op) {
 void TiledLoopOp::build(
     OpBuilder &builder, OperationState &result, ValueRange lowerBounds,
     ValueRange upperBounds, ValueRange steps, ValueRange inputs,
-    ValueRange outputs, ArrayRef<StringRef> iteratorTypes,
+    ValueRange outputs, ArrayAttr iteratorTypes,
     function_ref<void(OpBuilder &, Location, ValueRange)> bodyBuilderFn) {
   result.addOperands(lowerBounds);
   result.addOperands(upperBounds);
@@ -1758,9 +1758,14 @@ void TiledLoopOp::build(
                                 static_cast<int32_t>(steps.size()),
                                 static_cast<int32_t>(inputs.size()),
                                 static_cast<int32_t>(outputs.size())}));
-  result.addAttribute(getIteratorTypesAttrName(),
-                      builder.getStrArrayAttr(iteratorTypes));
-  result.addTypes(outputs.getTypes());
+  result.addAttribute(getIteratorTypesAttrName(), iteratorTypes);
+
+  // Add output types for `RankedTensorType` output arguments.
+  for (Value output : outputs) {
+    Type outputType = output.getType();
+    if (outputType.isa<RankedTensorType>())
+      result.addTypes(outputType);
+  }
 
   OpBuilder::InsertionGuard guard(builder);
   unsigned numIVs = steps.size();
@@ -1771,8 +1776,8 @@ void TiledLoopOp::build(
   if (bodyBuilderFn) {
     builder.setInsertionPointToStart(bodyBlock);
     bodyBuilderFn(builder, result.location, bodyBlock->getArguments());
+    TiledLoopOp::ensureTerminator(*bodyRegion, builder, result.location);
   }
-  TiledLoopOp::ensureTerminator(*bodyRegion, builder, result.location);
 }
 
 static void print(OpAsmPrinter &p, TiledLoopOp op) {
--
GitLab


From 1ba5c550d418e12a5bdbb884d2f7d94e0efc64ee Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 18 Mar 2021 15:34:39 +0000
Subject: [PATCH 0046/1000] [DAG] Improve folding (sext_in_reg
 (*_extend_vector_inreg x)) -> (sext_vector_inreg x)

Extend this to support ComputeNumSignBits of the (used) source vector
elements so that we can handle more than just the case where we're
sext_in_reg from the source element signbit.

Noticed while investigating the poor codegen in D98587.
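
The condition the combine now checks can be sketched in isolation as
follows. The helper `canFoldToSextInreg` and its direct `NumSignBits`
parameter are invented for illustration; in the patch below the count comes
from `DAG.ComputeNumSignBits` over the demanded source elements.

```cpp
// Standalone sketch of the extended fold condition for
//   sext_in_reg (*_extend_vector_inreg x) --> sext_vector_inreg x
// Illustrative only; not the DAGCombiner code itself.
#include <cassert>

bool canFoldToSextInreg(unsigned SrcEltBits, unsigned ExtVTBits,
                        unsigned NumSignBits) {
  // Original case: the source element width matches the sext_in_reg type,
  // so the in-reg extension starts exactly at the source sign bit.
  if (SrcEltBits == ExtVTBits)
    return true;
  // Extended case: SrcEltBits - NumSignBits "payload" bits sit below the
  // run of redundant sign-bit copies. If bit ExtVTBits-1 falls inside that
  // run, replicating it upward is the same as sign-extending the whole
  // source element.
  return (SrcEltBits - NumSignBits) < ExtVTBits;
}

int main() {
  // i32 source elements with at least 25 known sign bits, sext_in_reg
  // from i8: 32 - 25 = 7 < 8, so the fold applies.
  assert(canFoldToSextInreg(32, 8, 25));
  // Only 16 known sign bits: bit 7 may be a payload bit, so it does not.
  assert(!canFoldToSextInreg(32, 8, 16));
  return 0;
}
```

This is why the tests below trade `pmovzxbd`/`pslld`/`psrad` triples for a
single `pmovsxbd` (or a `psrad $24` after an interleave): the values being
extended are compare-and-invert masks of all-zeros or all-ones, so every
bit is already a sign copy.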
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  24 +-
 llvm/test/CodeGen/X86/vec_saddo.ll            |  34 +-
 llvm/test/CodeGen/X86/vec_smulo.ll            | 650 +++++++++---------
 llvm/test/CodeGen/X86/vec_ssubo.ll            |  34 +-
 llvm/test/CodeGen/X86/vec_uaddo.ll            |  34 +-
 llvm/test/CodeGen/X86/vec_umulo.ll            | 460 ++++++-------
 llvm/test/CodeGen/X86/vec_usubo.ll            |  34 +-
 llvm/test/CodeGen/X86/vsplit-and.ll           |  11 +-
 8 files changed, 589 insertions(+), 692 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 120c7f244c6a..1c063dae9d88 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11787,14 +11787,22 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   }
 
   // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
-  if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
-       N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
-       N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
-      N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) {
-    if (!LegalOperations ||
-        TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
-      return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT,
-                         N0.getOperand(0));
+  // if x is small enough or if we know that x has more than 1 sign bit and the
+  // sign_extend_inreg is extending from one of them.
+  if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+      N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
+      N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) {
+    SDValue N00 = N0.getOperand(0);
+    unsigned N00Bits = N00.getScalarValueSizeInBits();
+    unsigned DstElts = N0.getValueType().getVectorMinNumElements();
+    unsigned SrcElts = N00.getValueType().getVectorMinNumElements();
+    APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts);
+    if ((N00Bits == ExtVTBits ||
+         (N00Bits - DAG.ComputeNumSignBits(N00, DemandedSrcElts)) <
+             ExtVTBits) &&
+        (!LegalOperations ||
+         TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)))
+      return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00);
   }
 
   // fold (sext_in_reg (zext x)) -> (sext x)
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index ede201903a47..f07b4db1388f 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -539,12 +539,11 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSE2-NEXT:    pcmpeqb %xmm0, %xmm2
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
+; SSE2-NEXT:    psrad $24, %xmm4
 ; SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
-; SSE2-NEXT:    pslld $31, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
 ; SSE2-NEXT:    pslld $31, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
@@ -568,12 +567,11 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou
 ; SSSE3-NEXT:    pcmpeqb %xmm0, %xmm2
 ; SSSE3-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 =
xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSSE3-NEXT: psrad $24, %xmm4 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -597,9 +595,7 @@ define <16 x i32> @saddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -673,10 +669,8 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -692,10 +686,8 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm2 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -711,9 +703,7 @@ define <8 x i32> @saddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_smulo.ll b/llvm/test/CodeGen/X86/vec_smulo.ll index 4fa3367521a5..ed2d78493975 100644 --- a/llvm/test/CodeGen/X86/vec_smulo.ll +++ b/llvm/test/CodeGen/X86/vec_smulo.ll @@ -1292,12 +1292,11 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm0, %xmm7 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm7, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -1344,12 +1343,11 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm0, %xmm7 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm7, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -1392,9 +1390,7 @@ define <16 x i32> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm0, %xmm6 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm6, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -1527,272 +1523,268 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE2-LABEL: smulo_v32i8: ; SSE2: # %bb.0: ; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSE2-NEXT: movdqa %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm10 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = 
xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pmullw %xmm3, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE2-NEXT: pand %xmm8, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm10, %xmm9 -; SSE2-NEXT: pand %xmm8, %xmm9 -; SSE2-NEXT: packuswb %xmm1, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm9, %xmm10 +; SSE2-NEXT: pand %xmm8, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpgtb %xmm9, %xmm1 -; SSE2-NEXT: psraw $8, %xmm7 +; SSE2-NEXT: pcmpgtb %xmm10, %xmm1 +; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: pmullw %xmm7, %xmm6 +; SSE2-NEXT: pmullw %xmm4, %xmm6 ; SSE2-NEXT: psrlw $8, %xmm6 -; SSE2-NEXT: psraw $8, %xmm4 ; SSE2-NEXT: psraw $8, %xmm5 -; SSE2-NEXT: pmullw %xmm4, %xmm5 -; SSE2-NEXT: psrlw $8, %xmm5 -; SSE2-NEXT: packuswb %xmm6, %xmm5 -; SSE2-NEXT: pcmpeqb %xmm1, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSE2-NEXT: psraw $8, %xmm7 +; SSE2-NEXT: pmullw %xmm5, %xmm7 +; SSE2-NEXT: psrlw $8, %xmm7 +; SSE2-NEXT: packuswb %xmm6, %xmm7 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm0, %xmm11 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: pmullw %xmm7, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm10, %xmm2 -; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: pmullw %xmm4, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = 
xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm9, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: pand %xmm8, %xmm0 -; SSE2-NEXT: pand %xmm8, %xmm2 +; SSE2-NEXT: pand %xmm8, %xmm11 ; SSE2-NEXT: pcmpeqd %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm8, %xmm5 -; SSE2-NEXT: packuswb %xmm0, %xmm2 -; SSE2-NEXT: pcmpgtb %xmm2, %xmm7 -; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: pxor %xmm8, %xmm7 +; SSE2-NEXT: packuswb %xmm0, %xmm11 +; SSE2-NEXT: pcmpgtb %xmm11, %xmm4 +; SSE2-NEXT: psraw $8, %xmm5 ; SSE2-NEXT: psraw $8, %xmm3 -; SSE2-NEXT: pmullw %xmm4, %xmm3 +; SSE2-NEXT: pmullw %xmm5, %xmm3 ; SSE2-NEXT: psrlw $8, %xmm3 ; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: psraw $8, %xmm1 -; SSE2-NEXT: pmullw %xmm6, %xmm1 -; SSE2-NEXT: psrlw $8, %xmm1 -; SSE2-NEXT: packuswb %xmm3, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm7, %xmm1 -; SSE2-NEXT: pxor %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm0, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm8 -; SSE2-NEXT: psrad $31, %xmm8 +; SSE2-NEXT: psraw $8, %xmm2 +; SSE2-NEXT: pmullw %xmm6, %xmm2 +; SSE2-NEXT: psrlw $8, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pcmpeqb %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm8, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm5 +; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm6 ; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm7, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: 
psrad $31, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: movdqa %xmm9, 16(%rsi) -; SSE2-NEXT: movdqa %xmm2, (%rsi) -; SSE2-NEXT: movdqa %xmm5, 112(%rdi) +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm7 +; SSE2-NEXT: psrad $31, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSE2-NEXT: psrad $24, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: movdqa %xmm10, 16(%rsi) +; SSE2-NEXT: movdqa %xmm11, (%rsi) +; SSE2-NEXT: movdqa %xmm1, 64(%rdi) +; SSE2-NEXT: movdqa %xmm3, (%rdi) +; SSE2-NEXT: movdqa %xmm7, 112(%rdi) ; SSE2-NEXT: movdqa %xmm4, 96(%rdi) -; SSE2-NEXT: movdqa %xmm3, 80(%rdi) -; SSE2-NEXT: movdqa %xmm7, 64(%rdi) -; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm6, 32(%rdi) -; SSE2-NEXT: movdqa %xmm0, 16(%rdi) -; SSE2-NEXT: movdqa %xmm8, (%rdi) +; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm0, 48(%rdi) +; SSE2-NEXT: movdqa %xmm5, 32(%rdi) +; SSE2-NEXT: movdqa %xmm2, 16(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: smulo_v32i8: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movq %rdi, %rax -; SSSE3-NEXT: movdqa %xmm3, %xmm10 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; SSSE3-NEXT: movdqa %xmm3, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm1, %xmm9 +; SSSE3-NEXT: movdqa %xmm1, %xmm10 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: pmullw %xmm3, %xmm1 ; SSSE3-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSSE3-NEXT: pand %xmm8, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pmullw %xmm10, %xmm9 -; SSSE3-NEXT: pand %xmm8, %xmm9 -; SSSE3-NEXT: packuswb %xmm1, %xmm9 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pmullw %xmm9, %xmm10 +; SSSE3-NEXT: pand %xmm8, %xmm10 +; SSSE3-NEXT: packuswb %xmm1, %xmm10 ; SSSE3-NEXT: pxor %xmm1, %xmm1 -; SSSE3-NEXT: pcmpgtb %xmm9, %xmm1 -; SSSE3-NEXT: psraw $8, 
%xmm7 +; SSSE3-NEXT: pcmpgtb %xmm10, %xmm1 +; SSSE3-NEXT: psraw $8, %xmm4 ; SSSE3-NEXT: psraw $8, %xmm6 -; SSSE3-NEXT: pmullw %xmm7, %xmm6 +; SSSE3-NEXT: pmullw %xmm4, %xmm6 ; SSSE3-NEXT: psrlw $8, %xmm6 -; SSSE3-NEXT: psraw $8, %xmm4 ; SSSE3-NEXT: psraw $8, %xmm5 -; SSSE3-NEXT: pmullw %xmm4, %xmm5 -; SSSE3-NEXT: psrlw $8, %xmm5 -; SSSE3-NEXT: packuswb %xmm6, %xmm5 -; SSSE3-NEXT: pcmpeqb %xmm1, %xmm5 -; SSSE3-NEXT: movdqa %xmm2, %xmm10 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] +; SSSE3-NEXT: psraw $8, %xmm7 +; SSSE3-NEXT: pmullw %xmm5, %xmm7 +; SSSE3-NEXT: psrlw $8, %xmm7 +; SSSE3-NEXT: packuswb %xmm6, %xmm7 +; SSSE3-NEXT: pcmpeqb %xmm1, %xmm7 +; SSSE3-NEXT: movdqa %xmm2, %xmm9 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] -; SSSE3-NEXT: movdqa %xmm2, %xmm7 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm11 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: pmullw %xmm7, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pmullw %xmm10, %xmm2 -; SSSE3-NEXT: pxor %xmm7, %xmm7 +; SSSE3-NEXT: pmullw %xmm4, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pmullw %xmm9, %xmm11 +; SSSE3-NEXT: pxor %xmm4, %xmm4 ; SSSE3-NEXT: pand %xmm8, %xmm0 -; SSSE3-NEXT: pand %xmm8, %xmm2 +; SSSE3-NEXT: pand %xmm8, %xmm11 ; SSSE3-NEXT: pcmpeqd %xmm8, %xmm8 -; SSSE3-NEXT: pxor %xmm8, %xmm5 -; SSSE3-NEXT: packuswb %xmm0, %xmm2 -; SSSE3-NEXT: pcmpgtb %xmm2, %xmm7 -; SSSE3-NEXT: psraw $8, %xmm4 +; SSSE3-NEXT: pxor %xmm8, %xmm7 +; SSSE3-NEXT: packuswb %xmm0, %xmm11 +; SSSE3-NEXT: pcmpgtb %xmm11, %xmm4 +; SSSE3-NEXT: psraw $8, %xmm5 ; SSSE3-NEXT: psraw $8, %xmm3 -; SSSE3-NEXT: pmullw %xmm4, %xmm3 +; SSSE3-NEXT: pmullw %xmm5, %xmm3 ; SSSE3-NEXT: psrlw $8, %xmm3 ; SSSE3-NEXT: psraw $8, %xmm6 -; SSSE3-NEXT: psraw $8, %xmm1 -; SSSE3-NEXT: pmullw %xmm6, %xmm1 -; SSSE3-NEXT: psrlw $8, %xmm1 -; SSSE3-NEXT: packuswb %xmm3, %xmm1 -; SSSE3-NEXT: pcmpeqb %xmm7, %xmm1 -; SSSE3-NEXT: pxor %xmm8, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm0, 
%xmm8 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm8 -; SSSE3-NEXT: psrad $31, %xmm8 +; SSSE3-NEXT: psraw $8, %xmm2 +; SSSE3-NEXT: pmullw %xmm6, %xmm2 +; SSSE3-NEXT: psrlw $8, %xmm2 +; SSSE3-NEXT: packuswb %xmm3, %xmm2 +; SSSE3-NEXT: pcmpeqb %xmm4, %xmm2 +; SSSE3-NEXT: pxor %xmm8, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3],xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm0, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm5 +; SSSE3-NEXT: psrad $31, %xmm5 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm1, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: movdqa %xmm7, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm6 ; SSSE3-NEXT: psrad $31, %xmm6 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm5, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm3, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm5, %xmm4 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3],xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm7, %xmm4 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm5 -; SSSE3-NEXT: movdqa %xmm9, 16(%rsi) -; SSSE3-NEXT: movdqa %xmm2, (%rsi) -; SSSE3-NEXT: movdqa %xmm5, 112(%rdi) +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm7 +; SSSE3-NEXT: psrad $31, %xmm7 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; SSSE3-NEXT: psrad $24, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3] +; SSSE3-NEXT: psrad $24, %xmm1 +; SSSE3-NEXT: movdqa %xmm10, 16(%rsi) +; SSSE3-NEXT: movdqa %xmm11, (%rsi) +; SSSE3-NEXT: movdqa %xmm1, 64(%rdi) +; SSSE3-NEXT: movdqa %xmm3, (%rdi) +; SSSE3-NEXT: movdqa %xmm7, 112(%rdi) ; SSSE3-NEXT: movdqa %xmm4, 96(%rdi) -; SSSE3-NEXT: movdqa %xmm3, 80(%rdi) -; SSSE3-NEXT: movdqa 
%xmm7, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) -; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm8, (%rdi) +; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm5, 32(%rdi) +; SSSE3-NEXT: movdqa %xmm2, 16(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: smulo_v32i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero -; SSE41-NEXT: pmovsxbw %xmm3, %xmm7 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmovsxbw %xmm3, %xmm10 ; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,3,2,3] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; SSE41-NEXT: pmovsxbw %xmm1, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm1[2,3,2,3] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm8, %xmm1 -; SSE41-NEXT: pmullw %xmm10, %xmm9 -; SSE41-NEXT: pand %xmm8, %xmm9 -; SSE41-NEXT: packuswb %xmm1, %xmm9 +; SSE41-NEXT: pmullw %xmm9, %xmm11 +; SSE41-NEXT: pand %xmm8, %xmm11 +; SSE41-NEXT: packuswb %xmm1, %xmm11 ; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm9, %xmm1 -; SSE41-NEXT: pmullw %xmm7, %xmm4 +; SSE41-NEXT: pcmpgtb %xmm11, %xmm1 +; SSE41-NEXT: pmullw %xmm10, %xmm4 ; SSE41-NEXT: psrlw $8, %xmm4 ; SSE41-NEXT: pmovsxbw %xmm5, %xmm3 -; SSE41-NEXT: pmovsxbw %xmm6, %xmm5 +; SSE41-NEXT: pmovsxbw %xmm7, %xmm5 ; SSE41-NEXT: pmullw %xmm3, %xmm5 ; SSE41-NEXT: psrlw $8, %xmm5 ; SSE41-NEXT: packuswb %xmm5, %xmm4 ; SSE41-NEXT: pcmpeqb %xmm1, %xmm4 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm10 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; SSE41-NEXT: pmovsxbw %xmm2, %xmm5 -; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,3,2,3] -; SSE41-NEXT: movdqa %xmm2, %xmm7 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero +; SSE41-NEXT: pmovsxbw %xmm2, %xmm10 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,2,3] +; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmovsxbw %xmm0, %xmm2 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmullw %xmm7, %xmm0 -; SSE41-NEXT: pmullw %xmm10, %xmm1 -; SSE41-NEXT: pxor %xmm7, %xmm7 +; SSE41-NEXT: pmullw %xmm3, %xmm0 +; SSE41-NEXT: pmullw %xmm9, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 ; SSE41-NEXT: 
pand %xmm8, %xmm0 ; SSE41-NEXT: pand %xmm8, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm8, %xmm8 ; SSE41-NEXT: pxor %xmm8, %xmm4 ; SSE41-NEXT: packuswb %xmm0, %xmm1 -; SSE41-NEXT: pcmpgtb %xmm1, %xmm7 -; SSE41-NEXT: pmullw %xmm5, %xmm2 +; SSE41-NEXT: pcmpgtb %xmm1, %xmm3 +; SSE41-NEXT: pmullw %xmm10, %xmm2 ; SSE41-NEXT: psrlw $8, %xmm2 -; SSE41-NEXT: pmovsxbw %xmm6, %xmm0 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm3 -; SSE41-NEXT: pmullw %xmm0, %xmm3 -; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm2 -; SSE41-NEXT: pcmpeqb %xmm7, %xmm2 +; SSE41-NEXT: pmovsxbw %xmm7, %xmm0 +; SSE41-NEXT: pmovsxbw %xmm5, %xmm5 +; SSE41-NEXT: pmullw %xmm0, %xmm5 +; SSE41-NEXT: psrlw $8, %xmm5 +; SSE41-NEXT: packuswb %xmm5, %xmm2 +; SSE41-NEXT: pcmpeqb %xmm3, %xmm2 ; SSE41-NEXT: pxor %xmm8, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm8 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -1803,35 +1795,31 @@ define <32 x i32> @smulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE41-NEXT: pslld $31, %xmm5 ; SSE41-NEXT: psrad $31, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[3,3,3,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm6 -; SSE41-NEXT: psrad $31, %xmm6 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,1,1] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm7 -; SSE41-NEXT: psrad $31, %xmm7 -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm7 +; SSE41-NEXT: psrad $31, %xmm7 +; SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[2,3,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm6 +; SSE41-NEXT: psrad $31, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: movdqa %xmm9, 16(%rsi) +; SSE41-NEXT: pmovsxbd %xmm2, %xmm2 +; SSE41-NEXT: pmovsxbd %xmm4, %xmm4 +; SSE41-NEXT: movdqa %xmm11, 16(%rsi) ; SSE41-NEXT: movdqa %xmm1, (%rsi) ; SSE41-NEXT: movdqa %xmm4, 64(%rdi) ; SSE41-NEXT: movdqa %xmm2, (%rdi) ; SSE41-NEXT: movdqa %xmm0, 112(%rdi) -; SSE41-NEXT: movdqa %xmm3, 96(%rdi) +; SSE41-NEXT: movdqa %xmm6, 96(%rdi) ; SSE41-NEXT: movdqa %xmm7, 80(%rdi) -; SSE41-NEXT: movdqa %xmm6, 48(%rdi) +; SSE41-NEXT: movdqa %xmm3, 48(%rdi) ; SSE41-NEXT: movdqa %xmm5, 32(%rdi) ; SSE41-NEXT: movdqa %xmm8, 
16(%rdi) ; SSE41-NEXT: retq @@ -2151,86 +2139,82 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE2-NEXT: movdqa %xmm11, 32(%rsi) ; SSE2-NEXT: movdqa %xmm6, 16(%rsi) ; SSE2-NEXT: movdqa %xmm12, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm5, (%rsi) -; SSE2-NEXT: movdqa %xmm12, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm12 -; SSE2-NEXT: psrad $31, %xmm12 -; SSE2-NEXT: movdqa %xmm12, 224(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 192(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 128(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 64(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm4 +; SSE2-NEXT: movdqa %xmm4, (%rdi) +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm3, 224(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, 240(%rdi) +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm12 +; SSE2-NEXT: psrad $31, %xmm12 +; SSE2-NEXT: movdqa %xmm12, 208(%rdi) ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 192(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 208(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 160(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, 160(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: 
psrad $31, %xmm4 ; SSE2-NEXT: movdqa %xmm4, 176(%rdi) -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 128(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 144(%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 96(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 112(%rdi) -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 64(%rdi) ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 80(%rdi) -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 @@ -2366,86 +2350,82 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSSE3-NEXT: movdqa %xmm11, 32(%rsi) ; SSSE3-NEXT: movdqa %xmm6, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm12, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: movdqa %xmm5, (%rsi) -; SSSE3-NEXT: movdqa %xmm12, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm12 
-; SSSE3-NEXT: psrad $31, %xmm12 -; SSSE3-NEXT: movdqa %xmm12, 224(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 192(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 128(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 64(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, (%rdi) +; SSSE3-NEXT: movdqa %xmm3, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, 224(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, 240(%rdi) +; SSSE3-NEXT: movdqa %xmm2, %xmm3 +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm12 +; SSSE3-NEXT: psrad $31, %xmm12 +; SSSE3-NEXT: movdqa %xmm12, 208(%rdi) ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 192(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 208(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 160(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, 160(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: movdqa %xmm4, 176(%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 128(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 96(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 64(%rdi) ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, 32(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, (%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 @@ -2455,32 +2435,32 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-LABEL: smulo_v64i8: ; SSE41: # %bb.0: ; SSE41-NEXT: movq %rdi, %rax -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero ; SSE41-NEXT: movdqa %xmm7, %xmm10 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm8 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: movdqa %xmm3, %xmm11 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; 
SSE41-NEXT: pmullw %xmm10, %xmm11 ; SSE41-NEXT: pmovsxbw %xmm7, %xmm12 -; SSE41-NEXT: pmullw %xmm8, %xmm9 +; SSE41-NEXT: pmullw %xmm9, %xmm8 ; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] ; SSE41-NEXT: pand %xmm10, %xmm11 -; SSE41-NEXT: pand %xmm10, %xmm9 -; SSE41-NEXT: packuswb %xmm11, %xmm9 -; SSE41-NEXT: pmovsxbw %xmm3, %xmm8 +; SSE41-NEXT: pand %xmm10, %xmm8 +; SSE41-NEXT: packuswb %xmm11, %xmm8 +; SSE41-NEXT: pmovsxbw %xmm3, %xmm9 ; SSE41-NEXT: pshufd {{.*#+}} xmm11 = xmm7[2,3,2,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm13 = xmm3[2,3,2,3] -; SSE41-NEXT: pmullw %xmm12, %xmm8 +; SSE41-NEXT: pmullw %xmm12, %xmm9 ; SSE41-NEXT: pxor %xmm7, %xmm7 -; SSE41-NEXT: pcmpgtb %xmm9, %xmm7 -; SSE41-NEXT: psrlw $8, %xmm8 +; SSE41-NEXT: pcmpgtb %xmm8, %xmm7 +; SSE41-NEXT: psrlw $8, %xmm9 ; SSE41-NEXT: pmovsxbw %xmm11, %xmm11 ; SSE41-NEXT: pmovsxbw %xmm13, %xmm3 ; SSE41-NEXT: pmullw %xmm11, %xmm3 ; SSE41-NEXT: psrlw $8, %xmm3 -; SSE41-NEXT: packuswb %xmm3, %xmm8 -; SSE41-NEXT: pcmpeqb %xmm7, %xmm8 +; SSE41-NEXT: packuswb %xmm3, %xmm9 +; SSE41-NEXT: pcmpeqb %xmm7, %xmm9 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm11 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero ; SSE41-NEXT: movdqa %xmm6, %xmm7 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] @@ -2557,41 +2537,33 @@ define <64 x i32> @smulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: pcmpgtb %xmm5, %xmm0 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm8 +; SSE41-NEXT: pxor %xmm0, %xmm9 ; SSE41-NEXT: pxor %xmm0, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm2 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm9, 48(%rsi) +; SSE41-NEXT: movdqa %xmm8, 48(%rsi) ; SSE41-NEXT: movdqa %xmm12, 32(%rsi) ; SSE41-NEXT: movdqa %xmm11, 16(%rsi) ; SSE41-NEXT: movdqa %xmm5, (%rsi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm9, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 192(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 128(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm2, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 64(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm1, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 240(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm9[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -3018,10 +2990,8 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -3037,10 +3007,8 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pcmpeqw %xmm2, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm0, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -3056,9 +3024,7 @@ define <8 x i32> @smulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll index 20fca27ab2d4..b591f3b4b94e 100644 --- a/llvm/test/CodeGen/X86/vec_ssubo.ll +++ b/llvm/test/CodeGen/X86/vec_ssubo.ll @@ -544,12 +544,11 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm2, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: psrad $24, %xmm4 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -573,12 +572,11 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor 
%xmm2, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSSE3-NEXT: psrad $24, %xmm4 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -602,9 +600,7 @@ define <16 x i32> @ssubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm2, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -678,10 +674,8 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -697,10 +691,8 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm2 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm2, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm2 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -716,9 +708,7 @@ define <8 x i32> @ssubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm0, %xmm2 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm2, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm2 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll index 7cf566f7b3a1..836cfaf2d39b 100644 --- a/llvm/test/CodeGen/X86/vec_uaddo.ll +++ b/llvm/test/CodeGen/X86/vec_uaddo.ll @@ -626,12 +626,11 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: 
punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 @@ -654,12 +653,11 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm1, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm4 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm4, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 @@ -682,9 +680,7 @@ define <16 x i32> @uaddo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm4 @@ -757,10 +753,8 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pxor %xmm3, %xmm2 ; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 @@ -776,10 +770,8 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pxor %xmm3, %xmm2 ; SSSE3-NEXT: pxor %xmm1, %xmm3 ; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 @@ -794,9 +786,7 @@ define <8 x i32> @uaddo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm2, 
%xmm2 ; SSE41-NEXT: pxor %xmm0, %xmm2 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm2, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm2 ; SSE41-NEXT: psrad $31, %xmm2 diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll index 5d29e20888a0..9075b83275a9 100644 --- a/llvm/test/CodeGen/X86/vec_umulo.ll +++ b/llvm/test/CodeGen/X86/vec_umulo.ll @@ -1104,12 +1104,11 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm2, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -1155,12 +1154,11 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm2, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -1199,9 +1197,7 @@ define <16 x i32> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm2, %xmm5 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm5, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -1335,11 +1331,11 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE2-NEXT: pand %xmm8, %xmm7 ; SSE2-NEXT: movdqa %xmm3, %xmm6 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pmullw %xmm6, %xmm10 -; SSE2-NEXT: pand %xmm8, %xmm10 -; SSE2-NEXT: packuswb %xmm7, %xmm10 +; SSE2-NEXT: movdqa %xmm1, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pmullw %xmm6, %xmm11 +; 
SSE2-NEXT: pand %xmm8, %xmm11 +; SSE2-NEXT: packuswb %xmm7, %xmm11 ; SSE2-NEXT: pxor %xmm6, %xmm6 ; SSE2-NEXT: movdqa %xmm3, %xmm7 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] @@ -1369,49 +1365,47 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm6, %xmm0 ; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm7 -; SSE2-NEXT: psrad $31, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: movdqa %xmm2, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSE2-NEXT: pslld $31, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm5 -; SSE2-NEXT: psrad $31, %xmm5 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm10, 16(%rsi) +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSE2-NEXT: psrad $24, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm11, 16(%rsi) ; SSE2-NEXT: movdqa %xmm9, (%rsi) +; SSE2-NEXT: movdqa %xmm5, 64(%rdi) +; 
SSE2-NEXT: movdqa %xmm7, (%rdi) ; SSE2-NEXT: movdqa %xmm1, 112(%rdi) -; SSE2-NEXT: movdqa %xmm5, 96(%rdi) -; SSE2-NEXT: movdqa %xmm4, 80(%rdi) -; SSE2-NEXT: movdqa %xmm7, 64(%rdi) -; SSE2-NEXT: movdqa %xmm0, 48(%rdi) -; SSE2-NEXT: movdqa %xmm6, 32(%rdi) -; SSE2-NEXT: movdqa %xmm2, 16(%rdi) -; SSE2-NEXT: movdqa %xmm3, (%rdi) +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) +; SSE2-NEXT: movdqa %xmm6, 80(%rdi) +; SSE2-NEXT: movdqa %xmm2, 48(%rdi) +; SSE2-NEXT: movdqa %xmm4, 32(%rdi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdi) ; SSE2-NEXT: retq ; ; SSSE3-LABEL: umulo_v32i8: @@ -1439,11 +1433,11 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSSE3-NEXT: pand %xmm8, %xmm7 ; SSSE3-NEXT: movdqa %xmm3, %xmm6 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm10 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pmullw %xmm6, %xmm10 -; SSSE3-NEXT: pand %xmm8, %xmm10 -; SSSE3-NEXT: packuswb %xmm7, %xmm10 +; SSSE3-NEXT: movdqa %xmm1, %xmm11 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pmullw %xmm6, %xmm11 +; SSSE3-NEXT: pand %xmm8, %xmm11 +; SSSE3-NEXT: packuswb %xmm7, %xmm11 ; SSSE3-NEXT: pxor %xmm6, %xmm6 ; SSSE3-NEXT: movdqa %xmm3, %xmm7 ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm6[8],xmm7[9],xmm6[9],xmm7[10],xmm6[10],xmm7[11],xmm6[11],xmm7[12],xmm6[12],xmm7[13],xmm6[13],xmm7[14],xmm6[14],xmm7[15],xmm6[15] @@ -1473,49 +1467,47 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm6, %xmm0 ; SSSE3-NEXT: pxor %xmm3, %xmm0 ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm0, %xmm6 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm6 -; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm1, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm4, %xmm7 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm7 -; SSSE3-NEXT: psrad $31, %xmm7 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: movdqa %xmm2, %xmm4 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm4 ; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm1, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: 
punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7] +; SSSE3-NEXT: pslld $31, %xmm6 +; SSSE3-NEXT: psrad $31, %xmm6 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1],xmm10[2],xmm1[2],xmm10[3],xmm1[3],xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] ; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; SSSE3-NEXT: movdqa %xmm1, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm5 -; SSSE3-NEXT: psrad $31, %xmm5 +; SSSE3-NEXT: movdqa %xmm1, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm10, 16(%rsi) +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] +; SSSE3-NEXT: psrad $24, %xmm7 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm11, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm9, (%rsi) +; SSSE3-NEXT: movdqa %xmm5, 64(%rdi) +; SSSE3-NEXT: movdqa %xmm7, (%rdi) ; SSSE3-NEXT: movdqa %xmm1, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm5, 96(%rdi) -; SSSE3-NEXT: movdqa %xmm4, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm7, 64(%rdi) -; SSSE3-NEXT: movdqa %xmm0, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm6, 32(%rdi) -; SSSE3-NEXT: movdqa %xmm2, 16(%rdi) -; SSSE3-NEXT: movdqa %xmm3, (%rdi) +; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) +; SSSE3-NEXT: movdqa %xmm6, 80(%rdi) +; SSSE3-NEXT: movdqa %xmm2, 48(%rdi) +; SSSE3-NEXT: movdqa %xmm4, 32(%rdi) +; SSSE3-NEXT: movdqa %xmm0, 16(%rdi) ; SSSE3-NEXT: retq ; ; SSE41-LABEL: umulo_v32i8: @@ -1526,42 +1518,42 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE41-NEXT: movdqa %xmm0, %xmm7 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm4, %xmm7 -; SSE41-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,255,255] -; SSE41-NEXT: pand %xmm9, %xmm7 +; SSE41-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; SSE41-NEXT: pand %xmm10, %xmm7 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; SSE41-NEXT: pmullw %xmm5, %xmm6 ; SSE41-NEXT: movdqa %xmm6, %xmm8 -; SSE41-NEXT: pand %xmm9, %xmm8 +; SSE41-NEXT: pand %xmm10, %xmm8 ; SSE41-NEXT: packuswb %xmm7, %xmm8 ; SSE41-NEXT: movdqa %xmm3, %xmm7 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: movdqa %xmm1, %xmm5 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm7, %xmm5 -; SSE41-NEXT: pand %xmm9, %xmm5 -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero +; SSE41-NEXT: pand %xmm10, %xmm5 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm9 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; 
SSE41-NEXT: pmullw %xmm4, %xmm7 -; SSE41-NEXT: pand %xmm7, %xmm9 -; SSE41-NEXT: packuswb %xmm5, %xmm9 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] +; SSE41-NEXT: pmullw %xmm9, %xmm7 +; SSE41-NEXT: pand %xmm7, %xmm10 +; SSE41-NEXT: packuswb %xmm5, %xmm10 +; SSE41-NEXT: pxor %xmm5, %xmm5 +; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] ; SSE41-NEXT: pmullw %xmm3, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm1 ; SSE41-NEXT: psrlw $8, %xmm7 ; SSE41-NEXT: packuswb %xmm1, %xmm7 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm7 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm7 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm1, %xmm7 -; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] ; SSE41-NEXT: pmullw %xmm2, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm0 ; SSE41-NEXT: psrlw $8, %xmm6 ; SSE41-NEXT: packuswb %xmm0, %xmm6 -; SSE41-NEXT: pcmpeqb %xmm4, %xmm6 +; SSE41-NEXT: pcmpeqb %xmm5, %xmm6 ; SSE41-NEXT: pxor %xmm1, %xmm6 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero @@ -1579,26 +1571,22 @@ define <32 x i32> @umulo_v32i8(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm3 ; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[2,3,2,3] -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm4 -; SSE41-NEXT: psrad $31, %xmm4 -; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[3,3,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm7[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm5 ; SSE41-NEXT: psrad $31, %xmm5 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm6 -; SSE41-NEXT: psrad $31, %xmm6 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm7 = 
xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm7 -; SSE41-NEXT: psrad $31, %xmm7 -; SSE41-NEXT: movdqa %xmm9, 16(%rsi) +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm7[3,3,3,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; SSE41-NEXT: pslld $31, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: pmovsxbd %xmm6, %xmm6 +; SSE41-NEXT: pmovsxbd %xmm7, %xmm7 +; SSE41-NEXT: movdqa %xmm10, 16(%rsi) ; SSE41-NEXT: movdqa %xmm8, (%rsi) ; SSE41-NEXT: movdqa %xmm7, 64(%rdi) ; SSE41-NEXT: movdqa %xmm6, (%rdi) -; SSE41-NEXT: movdqa %xmm5, 112(%rdi) -; SSE41-NEXT: movdqa %xmm4, 96(%rdi) +; SSE41-NEXT: movdqa %xmm4, 112(%rdi) +; SSE41-NEXT: movdqa %xmm5, 96(%rdi) ; SSE41-NEXT: movdqa %xmm3, 80(%rdi) ; SSE41-NEXT: movdqa %xmm2, 48(%rdi) ; SSE41-NEXT: movdqa %xmm1, 32(%rdi) @@ -1872,86 +1860,82 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE2-NEXT: movdqa %xmm10, 32(%rsi) ; SSE2-NEXT: movdqa %xmm9, 16(%rsi) ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: movdqa %xmm8, (%rsi) -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 224(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 192(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 128(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm5, 64(%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm5 +; SSE2-NEXT: movdqa %xmm5, (%rdi) +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 224(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 ; SSE2-NEXT: movdqa %xmm5, 240(%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 192(%rdi) ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: movdqa %xmm3, 208(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, 160(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm4, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: movdqa %xmm4, 160(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: movdqa %xmm3, 176(%rdi) -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: movdqa %xmm4, 128(%rdi) ; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 144(%rdi) -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: movdqa %xmm1, 96(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm3, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: movdqa %xmm3, 96(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 ; SSE2-NEXT: movdqa %xmm2, 112(%rdi) -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: movdqa %xmm3, 64(%rdi) ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 80(%rdi) -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdi) -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSE2-NEXT: pslld $31, %xmm2 +; SSE2-NEXT: psrad $31, %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rdi) ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: movdqa %xmm1, 48(%rdi) -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; 
SSE2-NEXT: pslld $31, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: movdqa %xmm2, (%rdi) +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 @@ -2072,86 +2056,82 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSSE3-NEXT: movdqa %xmm10, 32(%rsi) ; SSSE3-NEXT: movdqa %xmm9, 16(%rsi) ; SSSE3-NEXT: movdqa %xmm3, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSSE3-NEXT: movdqa %xmm8, (%rsi) -; SSSE3-NEXT: movdqa %xmm3, %xmm5 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 224(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, 192(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, 128(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, 64(%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm5 +; SSSE3-NEXT: movdqa %xmm5, (%rdi) +; SSSE3-NEXT: movdqa %xmm4, %xmm5 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 224(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm5 ; SSSE3-NEXT: psrad $31, %xmm5 ; SSSE3-NEXT: movdqa %xmm5, 240(%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 192(%rdi) ; SSSE3-NEXT: movdqa %xmm2, %xmm4 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, 208(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, 160(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm4, %xmm3 +; 
SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm4 +; SSSE3-NEXT: psrad $31, %xmm4 +; SSSE3-NEXT: movdqa %xmm4, 160(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm3 ; SSSE3-NEXT: psrad $31, %xmm3 ; SSSE3-NEXT: movdqa %xmm3, 176(%rdi) -; SSSE3-NEXT: movdqa %xmm4, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm4 -; SSSE3-NEXT: psrad $31, %xmm4 -; SSSE3-NEXT: movdqa %xmm4, 128(%rdi) ; SSSE3-NEXT: movdqa %xmm1, %xmm3 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 144(%rdi) -; SSSE3-NEXT: movdqa %xmm1, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, 96(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm3, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm3 +; SSSE3-NEXT: psrad $31, %xmm3 +; SSSE3-NEXT: movdqa %xmm3, 96(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 ; SSSE3-NEXT: movdqa %xmm2, 112(%rdi) -; SSSE3-NEXT: movdqa %xmm3, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm3 -; SSSE3-NEXT: psrad $31, %xmm3 -; SSSE3-NEXT: movdqa %xmm3, 64(%rdi) ; SSSE3-NEXT: movdqa %xmm0, %xmm2 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 80(%rdi) -; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: movdqa %xmm0, 32(%rdi) -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSSE3-NEXT: movdqa %xmm2, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: pslld $31, %xmm2 +; SSSE3-NEXT: psrad $31, %xmm2 +; SSSE3-NEXT: movdqa %xmm2, 32(%rdi) ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: movdqa %xmm1, 48(%rdi) -; SSSE3-NEXT: movdqa %xmm2, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm2 -; SSSE3-NEXT: psrad $31, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, (%rdi) +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 @@ -2200,27 +2180,27 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = 
xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm12 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; SSE41-NEXT: pmullw %xmm14, %xmm12 -; SSE41-NEXT: movdqa %xmm12, %xmm15 -; SSE41-NEXT: pand %xmm10, %xmm15 -; SSE41-NEXT: packuswb %xmm0, %xmm15 +; SSE41-NEXT: movdqa %xmm12, %xmm14 +; SSE41-NEXT: pand %xmm10, %xmm14 +; SSE41-NEXT: packuswb %xmm0, %xmm14 ; SSE41-NEXT: movdqa %xmm7, %xmm0 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: movdqa %xmm4, %xmm3 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE41-NEXT: pmullw %xmm0, %xmm3 ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero -; SSE41-NEXT: pmovzxbw {{.*#+}} xmm14 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero -; SSE41-NEXT: pmullw %xmm0, %xmm14 +; SSE41-NEXT: pmovzxbw {{.*#+}} xmm15 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; SSE41-NEXT: pmullw %xmm0, %xmm15 ; SSE41-NEXT: pand %xmm10, %xmm3 -; SSE41-NEXT: pand %xmm14, %xmm10 +; SSE41-NEXT: pand %xmm15, %xmm10 ; SSE41-NEXT: packuswb %xmm3, %xmm10 ; SSE41-NEXT: pxor %xmm0, %xmm0 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; SSE41-NEXT: pmullw %xmm7, %xmm4 ; SSE41-NEXT: psrlw $8, %xmm4 -; SSE41-NEXT: psrlw $8, %xmm14 -; SSE41-NEXT: packuswb %xmm4, %xmm14 +; SSE41-NEXT: psrlw $8, %xmm15 +; SSE41-NEXT: packuswb %xmm4, %xmm15 ; SSE41-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] ; SSE41-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; SSE41-NEXT: pmullw %xmm6, %xmm2 @@ -2239,48 +2219,40 @@ define <64 x i32> @umulo_v64i8(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %p2) nou ; SSE41-NEXT: psrlw $8, %xmm5 ; SSE41-NEXT: psrlw $8, %xmm8 ; SSE41-NEXT: packuswb %xmm5, %xmm8 -; SSE41-NEXT: pcmpeqb %xmm0, %xmm14 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm15 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm12 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm9 ; SSE41-NEXT: pcmpeqb %xmm0, %xmm8 ; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE41-NEXT: pxor %xmm0, %xmm14 +; SSE41-NEXT: pxor %xmm0, %xmm15 ; SSE41-NEXT: pxor %xmm0, %xmm12 ; SSE41-NEXT: pxor %xmm0, %xmm9 ; SSE41-NEXT: pxor %xmm0, %xmm8 ; SSE41-NEXT: movdqa %xmm10, 48(%rsi) -; SSE41-NEXT: movdqa %xmm15, 32(%rsi) +; SSE41-NEXT: movdqa %xmm14, 32(%rsi) ; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE41-NEXT: movaps %xmm0, 16(%rsi) ; SSE41-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE41-NEXT: movaps %xmm0, (%rsi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd 
%xmm15, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 192(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm12, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 128(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm9, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 64(%rdi) -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm8, %xmm0 ; SSE41-NEXT: movdqa %xmm0, (%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,3,2,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,3,2,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 224(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[3,3,3,3] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, 240(%rdi) -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm0 ; SSE41-NEXT: psrad $31, %xmm0 @@ -2655,10 +2627,8 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: pcmpeqw %xmm0, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -2674,10 +2644,8 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: pcmpeqw %xmm0, %xmm3 ; SSSE3-NEXT: pcmpeqd %xmm1, %xmm1 ; SSSE3-NEXT: pxor %xmm3, %xmm1 -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSSE3-NEXT: psrad $16, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -2693,9 +2661,7 @@ define <8 x i32> @umulo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm0, %xmm3 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm3, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; 
SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll index dde6832d6482..4e2906c3d5e3 100644 --- a/llvm/test/CodeGen/X86/vec_usubo.ll +++ b/llvm/test/CodeGen/X86/vec_usubo.ll @@ -670,12 +670,11 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE2-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 ; SSE2-NEXT: movdqa %xmm3, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -698,12 +697,11 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSSE3-NEXT: pcmpeqb %xmm4, %xmm0 ; SSSE3-NEXT: pcmpeqd %xmm3, %xmm3 ; SSSE3-NEXT: pxor %xmm0, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 ; SSSE3-NEXT: movdqa %xmm3, %xmm1 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: pslld $31, %xmm0 -; SSSE3-NEXT: psrad $31, %xmm0 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 @@ -726,9 +724,7 @@ define <16 x i32> @usubo_v16i8(<16 x i8> %a0, <16 x i8> %a1, <16 x i8>* %p2) nou ; SSE41-NEXT: pcmpeqb %xmm4, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm3, %xmm3 ; SSE41-NEXT: pxor %xmm0, %xmm3 -; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxbd %xmm3, %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1] ; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero ; SSE41-NEXT: pslld $31, %xmm1 @@ -800,10 +796,8 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE2-NEXT: psubw %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm0, %xmm2 ; SSE2-NEXT: pcmpgtw %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pslld $31, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSE2-NEXT: pslld $31, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 @@ -820,10 +814,8 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSSE3-NEXT: psubw %xmm1, %xmm0 ; SSSE3-NEXT: pxor %xmm0, %xmm2 ; SSSE3-NEXT: pcmpgtw %xmm3, %xmm2 -; SSSE3-NEXT: movdqa %xmm2, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] -; 
SSSE3-NEXT: pslld $31, %xmm1 -; SSSE3-NEXT: psrad $31, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSSE3-NEXT: psrad $16, %xmm1 ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7] ; SSSE3-NEXT: pslld $31, %xmm2 ; SSSE3-NEXT: psrad $31, %xmm2 @@ -840,9 +832,7 @@ define <8 x i32> @usubo_v8i16(<8 x i16> %a0, <8 x i16> %a1, <8 x i16>* %p2) noun ; SSE41-NEXT: pcmpeqw %xmm2, %xmm0 ; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 ; SSE41-NEXT: pxor %xmm0, %xmm1 -; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; SSE41-NEXT: pslld $31, %xmm0 -; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: pmovsxwd %xmm1, %xmm0 ; SSE41-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7] ; SSE41-NEXT: pslld $31, %xmm1 ; SSE41-NEXT: psrad $31, %xmm1 diff --git a/llvm/test/CodeGen/X86/vsplit-and.ll b/llvm/test/CodeGen/X86/vsplit-and.ll index aa043ed67f69..e1a99d209acf 100644 --- a/llvm/test/CodeGen/X86/vsplit-and.ll +++ b/llvm/test/CodeGen/X86/vsplit-and.ll @@ -41,16 +41,13 @@ define void @t2(<3 x i64>* %dst, <3 x i64> %src1, <3 x i64> %src2) nounwind read ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; CHECK-NEXT: xorps %xmm0, %xmm1 ; CHECK-NEXT: andnps %xmm1, %xmm2 -; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,3,3] ; CHECK-NEXT: psllq $63, %xmm0 ; CHECK-NEXT: psrad $31, %xmm0 ; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,3,3] -; CHECK-NEXT: psllq $63, %xmm1 -; CHECK-NEXT: psrad $31, %xmm1 -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; CHECK-NEXT: movq %xmm1, 16(%rdi) -; CHECK-NEXT: movdqa %xmm0, (%rdi) +; CHECK-NEXT: pmovsxdq %xmm2, %xmm1 +; CHECK-NEXT: movdqa %xmm1, (%rdi) +; CHECK-NEXT: movq %xmm0, 16(%rdi) ; CHECK-NEXT: retq %cmp1 = icmp ne <3 x i64> %src1, zeroinitializer %cmp2 = icmp ne <3 x i64> %src2, zeroinitializer -- GitLab From 1a6ab32f3365b695d5f8397b5745b7fe6e86722d Mon Sep 17 00:00:00 2001 From: Nigel Perks Date: Wed, 10 Mar 2021 15:29:40 +0000 Subject: [PATCH 0047/1000] [XCore] Remove XFAIL: xcore from passing test. The pass can be seen on staging buildbot clang-xcore-ubuntu-20-x64. Differential Revision: https://reviews.llvm.org/D98352 --- llvm/test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll b/llvm/test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll index 1497bbb0c970..3ca31efd8881 100644 --- a/llvm/test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll +++ b/llvm/test/CodeGen/Generic/2014-02-05-OpaqueConstants.ll @@ -1,6 +1,6 @@ ; Test that opaque constants are not creating an infinite DAGCombine loop ; RUN: llc < %s -; XFAIL: r600, xcore +; XFAIL: r600 @a = common global i32* null, align 8 @c = common global i32 0, align 4 -- GitLab From 168b206cd8efb59438a1db65d8e1639ae2c9f662 Mon Sep 17 00:00:00 2001 From: Asher Mancinelli Date: Thu, 18 Mar 2021 15:50:43 +0000 Subject: [PATCH 0048/1000] [flang] Unittests for runtime terminator Create test fixture for runtime tests which enables verification of failure cases. Test some runtime IO APIs for failure cases. Support testing efforts in D98303. Expand on effort discussed in D98601. 
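For illustration, the new death tests follow this shape (a sketch
assembled from the code added below; the suite and test names mirror
the added tests, and the message literal is a placeholder):

    // The suite struct derives from the fixture so the crash handler
    // registered in CrashHandlerFixture::SetUp() can report the message.
    struct TestTerminator : CrashHandlerFixture {};

    TEST(TestTerminator, CrashTest) {
      static Fortran::runtime::Terminator t;
      // Crash() prints the message and exits abnormally; ASSERT_DEATH
      // verifies both the abnormal exit and the reported text.
      ASSERT_DEATH(t.Crash("some message"), "some message");
    }

The IO misuse tests below apply the same pattern to
BeginInternalFormattedOutput and the Output* entry points.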
Reviewed By: awarzynski Differential Revision: https://reviews.llvm.org/D98652 --- flang/unittests/RuntimeGTest/CMakeLists.txt | 2 + .../RuntimeGTest/CrashHandlerFixture.cpp | 34 ++++ .../RuntimeGTest/CrashHandlerFixture.h | 21 +++ .../RuntimeGTest/RuntimeCrashTest.cpp | 157 ++++++++++++++++++ 4 files changed, 214 insertions(+) create mode 100644 flang/unittests/RuntimeGTest/CrashHandlerFixture.cpp create mode 100644 flang/unittests/RuntimeGTest/CrashHandlerFixture.h create mode 100644 flang/unittests/RuntimeGTest/RuntimeCrashTest.cpp diff --git a/flang/unittests/RuntimeGTest/CMakeLists.txt b/flang/unittests/RuntimeGTest/CMakeLists.txt index 77aff3069f14..f26cb44be5fe 100644 --- a/flang/unittests/RuntimeGTest/CMakeLists.txt +++ b/flang/unittests/RuntimeGTest/CMakeLists.txt @@ -1,5 +1,7 @@ add_flang_unittest(FlangRuntimeTests CharacterTest.cpp + RuntimeCrashTest.cpp + CrashHandlerFixture.cpp ) target_link_libraries(FlangRuntimeTests diff --git a/flang/unittests/RuntimeGTest/CrashHandlerFixture.cpp b/flang/unittests/RuntimeGTest/CrashHandlerFixture.cpp new file mode 100644 index 000000000000..315a555789e7 --- /dev/null +++ b/flang/unittests/RuntimeGTest/CrashHandlerFixture.cpp @@ -0,0 +1,34 @@ +//===-- flang/unittests/RuntimeGTest/CrashHandlerFixture.cpp ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "CrashHandlerFixture.h" +#include "../../runtime/terminator.h" +#include +#include + +// Replaces Fortran runtime's crash handler so we can verify the crash message +[[noreturn]] static void CatchCrash( + const char *sourceFile, int sourceLine, const char *message, va_list &ap) { + char buffer[1000]; + std::vsnprintf(buffer, sizeof buffer, message, ap); + va_end(ap); + llvm::errs() + << "Test " + << ::testing::UnitTest::GetInstance()->current_test_info()->name() + << " crashed in file " + << (sourceFile ? sourceFile : "unknown source file") << '(' << sourceLine + << "): " << buffer << '\n'; + std::exit(EXIT_FAILURE); +} + +// Register the crash handler above when creating each unit test in this suite +void CrashHandlerFixture::SetUp() { + static bool isCrashHanlderRegistered{false}; + if (not isCrashHanlderRegistered) + Fortran::runtime::Terminator::RegisterCrashHandler(CatchCrash); + isCrashHanlderRegistered = true; +} diff --git a/flang/unittests/RuntimeGTest/CrashHandlerFixture.h b/flang/unittests/RuntimeGTest/CrashHandlerFixture.h new file mode 100644 index 000000000000..d368c6fb55ba --- /dev/null +++ b/flang/unittests/RuntimeGTest/CrashHandlerFixture.h @@ -0,0 +1,21 @@ +//===-- flang/unittests/RuntimeGTest/CrashHandlerFixture.h ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Test fixture registers a custom crash handler to ensure death tests fail +/// with expected message. 
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_FLANG_UNITTESTS_RUNTIMEGTEST_CRASHHANDLERFIXTURE_H +#define LLVM_FLANG_UNITTESTS_RUNTIMEGTEST_CRASHHANDLERFIXTURE_H +#include + +struct CrashHandlerFixture : testing::Test { + void SetUp(); +}; + +#endif diff --git a/flang/unittests/RuntimeGTest/RuntimeCrashTest.cpp b/flang/unittests/RuntimeGTest/RuntimeCrashTest.cpp new file mode 100644 index 000000000000..c8945409c8c7 --- /dev/null +++ b/flang/unittests/RuntimeGTest/RuntimeCrashTest.cpp @@ -0,0 +1,157 @@ +//===-- flang/unittests/RuntimeGTest/CrashHandlerFixture.cpp ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// Selected APIs are tested here to support development of unit tests for other +/// runtime components and ensure the test fixture handles crashes as we expect. +// +//===----------------------------------------------------------------------===// +#include "CrashHandlerFixture.h" +#include "../../runtime/io-api.h" +#include "../../runtime/terminator.h" +#include + +using namespace Fortran::runtime; +using namespace Fortran::runtime::io; + +//------------------------------------------------------------------------------ +/// Test crashes through direct calls to terminator methods +//------------------------------------------------------------------------------ +struct TestTerminator : CrashHandlerFixture {}; + +#define TEST_CRASH_HANDLER_MESSAGE \ + "Intentionally crashing runtime for unit test" + +TEST(TestTerminator, CrashTest) { + static Fortran::runtime::Terminator t; + ASSERT_DEATH(t.Crash(TEST_CRASH_HANDLER_MESSAGE), TEST_CRASH_HANDLER_MESSAGE); +} + +#undef TEST_CRASH_HANDLER_MESSAGE + +TEST(TestTerminator, CheckFailedLocationTest) { + static Fortran::runtime::Terminator t; + ASSERT_DEATH(t.CheckFailed("predicate", "someFileName", 789), + "RUNTIME_CHECK\\(predicate\\) failed at someFileName\\(789\\)"); +} + +TEST(TestTerminator, CheckFailedTest) { + static Fortran::runtime::Terminator t; + ASSERT_DEATH(t.CheckFailed("predicate"), + "RUNTIME_CHECK\\(predicate\\) failed at \\(null\\)\\(0\\)"); +} + +//------------------------------------------------------------------------------ +/// Test misuse of io api +//------------------------------------------------------------------------------ +struct TestIOCrash : CrashHandlerFixture {}; + +TEST(TestIOCrash, FormatDescriptorWriteMismatchTest) { + static constexpr int bufferSize{4}; + static char buffer[bufferSize]; + static const char *format{"(A4)"}; + auto *cookie{IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format))}; + ASSERT_DEATH(IONAME(OutputInteger64)(cookie, 0xfeedface), + "Data edit descriptor 'A' may not be used with an INTEGER data item"); +} + +TEST(TestIOCrash, InvalidFormatCharacterTest) { + static constexpr int bufferSize{1}; + static char buffer[bufferSize]; + static const char *format{"(C1)"}; + auto *cookie{IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format))}; + ASSERT_DEATH(IONAME(OutputInteger64)(cookie, 0xfeedface), + "Unknown 'C' edit descriptor in FORMAT"); +} + +//------------------------------------------------------------------------------ +/// Test buffer overwrites with Output* functions +/// Each test performs the 
tested IO operation correctly first, before causing +/// an overwrite to demonstrate that the failure is caused by the overwrite and +/// not a misuse of the API. +//------------------------------------------------------------------------------ +TEST(TestIOCrash, OverwriteBufferAsciiTest) { + static constexpr int bufferSize{4}; + static char buffer[bufferSize]; + static const char *format{"(A4)"}; + auto *cookie{IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format))}; + IONAME(OutputAscii)(cookie, "four", bufferSize); + ASSERT_DEATH(IONAME(OutputAscii)(cookie, "Too many characters!", 20), + "Internal write overran available records"); +} + +TEST(TestIOCrash, OverwriteBufferCharacterTest) { + static constexpr int bufferSize{1}; + static char buffer[bufferSize]; + static const char *format{"(A1)"}; + auto *cookie{IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format))}; + IONAME(OutputCharacter)(cookie, "a", 1); + ASSERT_DEATH(IONAME(OutputCharacter)(cookie, "a", 1), + "Internal write overran available records"); +} + +TEST(TestIOCrash, OverwriteBufferLogicalTest) { + static constexpr int bufferSize{1}; + static char buffer[bufferSize]; + static const char *format{"(L1)"}; + auto *cookie{IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format))}; + IONAME(OutputLogical)(cookie, true); + ASSERT_DEATH(IONAME(OutputLogical)(cookie, true), + "Internal write overran available records"); +} + +TEST(TestIOCrash, OverwriteBufferRealTest) { + static constexpr int bufferSize{1}; + static char buffer[bufferSize]; + static const char *format{"(F1)"}; + auto *cookie{IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format))}; + IONAME(OutputReal32)(cookie, 1.); + EXPECT_DEATH(IONAME(OutputReal32)(cookie, 1.), + "Internal write overran available records"); + + std::memset(buffer, '\0', bufferSize); + cookie = IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format)); + IONAME(OutputReal64)(cookie, 1.); + EXPECT_DEATH(IONAME(OutputReal64)(cookie, 1.), + "Internal write overran available records"); +} + +TEST(TestIOCrash, OverwriteBufferComplexTest) { + static constexpr int bufferSize{8}; + static char buffer[bufferSize]; + static const char *format{"(Z1,Z1)"}; + auto *cookie{IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format))}; + IONAME(OutputComplex32)(cookie, 1., 1.); + EXPECT_DEATH(IONAME(OutputComplex32)(cookie, 1., 1.), + "Internal write overran available records"); + + std::memset(buffer, '\0', bufferSize); + cookie = IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format)); + IONAME(OutputComplex64)(cookie, 1., 1.); + EXPECT_DEATH(IONAME(OutputComplex64)(cookie, 1., 1.), + "Internal write overran available records"); +} + +TEST(TestIOCrash, OverwriteBufferIntegerTest) { + static constexpr int bufferSize{1}; + static char buffer[bufferSize]; + static const char *format{"(I1)"}; + auto *cookie{IONAME(BeginInternalFormattedOutput)( + buffer, bufferSize, format, std::strlen(format))}; + IONAME(OutputInteger64)(cookie, 0xdeadbeef); + ASSERT_DEATH(IONAME(OutputInteger64)(cookie, 0xdeadbeef), + "Internal write overran available records"); +} -- GitLab From 253f804debb3424470b2ed27f3c812ead908d4ca Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 18 Mar 2021 15:56:39 +0000 Subject: [PATCH 0049/1000] [amdgpu] Update med3 combine to skip i64 [amdgpu] Update med3 combine to 
skip i64 Fixes an assumption that a type which is not i32 will be i16. This asserts when trying to sign/zero extend an i64 to i32. Test case was cut down from an openmp application. Variations on it are hit by other combines before reaching the problematic one, e.g. replacing the immediate values with other function arguments changes the codegen path and misses this combine. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D98872 --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 18 +++++++++++------- llvm/test/CodeGen/AMDGPU/smed3.ll | 13 +++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index eb5b06e5a46b..124f7449bc27 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9772,15 +9772,19 @@ SDValue SITargetLowering::performIntMed3ImmCombine( } // If there isn't a 16-bit med3 operation, convert to 32-bit. - MVT NVT = MVT::i32; - unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + if (VT == MVT::i16) { + MVT NVT = MVT::i32; + unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; - SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); - SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); - SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); + SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0)); + SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1)); + SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1); - SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); - return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); + SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3); + return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); + } + + return SDValue(); } static ConstantFPSDNode *getSplatConstantFP(SDValue Op) { diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll index 16aff2edba95..494510430ad5 100644 --- a/llvm/test/CodeGen/AMDGPU/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/smed3.ll @@ -80,6 +80,19 @@ define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 ad ret void } +; Regression test for performIntMed3ImmCombine extending arguments to 32 bit +; which failed for 64 bit arguments. Previously asserted / crashed. 
+; GCN-LABEL: {{^}}test_intMed3ImmCombine_no_32bit_extend: +; GCN: v_cmp_lt_i64 +; GCN: v_cmp_gt_i64 +define i64 @test_intMed3ImmCombine_no_32bit_extend(i64 %x) { + %smax = call i64 @llvm.smax.i64(i64 %x, i64 -2) + %smin = call i64 @llvm.smin.i64(i64 %smax, i64 2) + ret i64 %smin +} +declare i64 @llvm.smax.i64(i64, i64) +declare i64 @llvm.smin.i64(i64, i64) + ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16: ; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -- GitLab From 3f37c2823072b718d2690e0eb5edc992d78bd9da Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 17 Mar 2021 12:00:49 -0700 Subject: [PATCH 0050/1000] [AMDGPU] Remove unused template parameters of MUBUF_Real_AllAddr_vi Differential Revision: https://reviews.llvm.org/D98804 --- llvm/lib/Target/AMDGPU/BUFInstructions.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index e7930ba151c1..6a760bac311b 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -2303,7 +2303,7 @@ multiclass MUBUF_Real_vi_gfx90a op, MUBUF_Pseudo ps> { def _gfx90a : MUBUF_Real_gfx90a; } -multiclass MUBUF_Real_AllAddr_vi op, bit isAtomic = 0, bit isAtomicRet = 0> { +multiclass MUBUF_Real_AllAddr_vi op> { defm _OFFSET : MUBUF_Real_vi_gfx90a (NAME#"_OFFSET")>; defm _OFFEN : MUBUF_Real_vi_gfx90a (NAME#"_OFFEN")>; defm _IDXEN : MUBUF_Real_vi_gfx90a (NAME#"_IDXEN")>; @@ -2379,7 +2379,7 @@ multiclass MUBUF_Real_AllAddr_gfx80 op> { } multiclass MUBUF_Real_Atomic_vi op> : - MUBUF_Real_AllAddr_vi { + MUBUF_Real_AllAddr_vi { defm _OFFSET_RTN : MUBUF_Real_vi_gfx90a (NAME#"_OFFSET_RTN")>; defm _OFFEN_RTN : MUBUF_Real_vi_gfx90a (NAME#"_OFFEN_RTN")>; defm _IDXEN_RTN : MUBUF_Real_vi_gfx90a (NAME#"_IDXEN_RTN")>; -- GitLab From ced7256778699639d37cfa70ff65b4fcbdf62ebc Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Wed, 17 Mar 2021 17:37:59 -0700 Subject: [PATCH 0051/1000] [libsupport] Silence a bogus valgrind warning. Valgrind is reporting this bogus warning because it doesn't model pthread_sigmask fully accurately. This is a valgrind bug, but silencing it has effectively no cost, so just do it. 
==73662== Syscall param __pthread_sigmask(set) points to uninitialised byte(s) ==73662== at 0x101E9D4C2: __pthread_sigmask (in /usr/lib/system/libsystem_kernel.dylib) ==73662== by 0x101EFB5EA: pthread_sigmask (in /usr/lib/system/libsystem_pthread.dylib) ==73662== by 0x1000D9F6D: llvm::sys::Process::SafelyCloseFileDescriptor(int) (in /Users/chrisl/Projects/circt/build/bin/firtool) ==73662== by 0x100072795: llvm::ErrorOr > > getFileAux(llvm::Twine const&, long long, unsigned long long, unsigned long long, bool, bool) (in /Users/chrisl/Projects/circt/build/bin/firtool) ==73662== by 0x100072573: llvm::MemoryBuffer::getFileOrSTDIN(llvm::Twine const&, long long, bool) (in /Users/chrisl/Projects/circt/build/bin/firtool) ==73662== by 0x100282C25: mlir::openInputFile(llvm::StringRef, std::__1::basic_string, std::__1::allocator >*) (in /Users/chrisl/Projects/circt/build/bin Differential Revision: https://reviews.llvm.org/D98830 --- llvm/lib/Support/Unix/Process.inc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc index c3d8f7001de3..1ea36aae63f2 100644 --- a/llvm/lib/Support/Unix/Process.inc +++ b/llvm/lib/Support/Unix/Process.inc @@ -236,11 +236,11 @@ std::error_code Process::FixupStandardFileDescriptors() { std::error_code Process::SafelyCloseFileDescriptor(int FD) { // Create a signal set filled with *all* signals. - sigset_t FullSet; - if (sigfillset(&FullSet) < 0) + sigset_t FullSet, SavedSet; + if (sigfillset(&FullSet) < 0 || sigfillset(&SavedSet) < 0) return std::error_code(errno, std::generic_category()); + // Atomically swap our current signal mask with a full mask. - sigset_t SavedSet; #if LLVM_ENABLE_THREADS if (int EC = pthread_sigmask(SIG_SETMASK, &FullSet, &SavedSet)) return std::error_code(EC, std::generic_category()); -- GitLab From c2f8e158f57c173298ac39db8fd44211604ed003 Mon Sep 17 00:00:00 2001 From: Mike Rice Date: Wed, 17 Mar 2021 16:43:47 -0700 Subject: [PATCH 0052/1000] [OPENMP51]Support for the 'destroy' clause with interop variable. Added basic parsing/sema/serialization support to extend the existing 'destroy' clause for use with the 'interop' directive. Differential Revision: https://reviews.llvm.org/D98834 --- clang/include/clang/AST/OpenMPClause.h | 52 ++++++++++++++++- clang/include/clang/AST/RecursiveASTVisitor.h | 3 +- clang/include/clang/Sema/Sema.h | 5 +- clang/lib/AST/OpenMPClause.cpp | 7 ++- clang/lib/AST/StmtProfile.cpp | 5 +- clang/lib/Parse/ParseOpenMP.cpp | 16 +++++- clang/lib/Sema/SemaOpenMP.cpp | 33 +++++++---- clang/lib/Sema/TreeTransform.h | 23 +++++++- clang/lib/Serialization/ASTReader.cpp | 6 +- clang/lib/Serialization/ASTWriter.cpp | 6 +- clang/test/OpenMP/interop_ast_print.cpp | 56 +++++++++++++++++++ clang/test/OpenMP/interop_messages.cpp | 26 +++++++++ clang/tools/libclang/CIndex.cpp | 5 +- llvm/include/llvm/Frontend/OpenMP/OMP.td | 1 + 14 files changed, 221 insertions(+), 23 deletions(-) diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index b342ffb93256..f71eb15feea2 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -7561,14 +7561,49 @@ public: }; /// This represents 'destroy' clause in the '#pragma omp depobj' -/// directive. +/// directive or the '#pragma omp interop' directive.. /// /// \code /// #pragma omp depobj(a) destroy +/// #pragma omp interop destroy(obj) /// \endcode -/// In this example directive '#pragma omp depobj' has 'destroy' clause. 
+/// In these examples directive '#pragma omp depobj' and '#pragma omp interop' +/// have a 'destroy' clause. The 'interop' directive includes an object. class OMPDestroyClause final : public OMPClause { + friend class OMPClauseReader; + + /// Location of '('. + SourceLocation LParenLoc; + + /// Location of interop variable. + SourceLocation VarLoc; + + /// The interop variable. + Stmt *InteropVar = nullptr; + + /// Set the interop variable. + void setInteropVar(Expr *E) { InteropVar = E; } + + /// Sets the location of '('. + void setLParenLoc(SourceLocation Loc) { LParenLoc = Loc; } + + /// Sets the location of the interop variable. + void setVarLoc(SourceLocation Loc) { VarLoc = Loc; } + public: + /// Build 'destroy' clause with an interop variable expression \a InteropVar. + /// + /// \param InteropVar The interop variable. + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param VarLoc Location of the interop variable. + /// \param EndLoc Ending location of the clause. + OMPDestroyClause(Expr *InteropVar, SourceLocation StartLoc, + SourceLocation LParenLoc, SourceLocation VarLoc, + SourceLocation EndLoc) + : OMPClause(llvm::omp::OMPC_destroy, StartLoc, EndLoc), + LParenLoc(LParenLoc), VarLoc(VarLoc), InteropVar(InteropVar) {} + /// Build 'destroy' clause. /// /// \param StartLoc Starting location of the clause. @@ -7581,11 +7616,24 @@ public: : OMPClause(llvm::omp::OMPC_destroy, SourceLocation(), SourceLocation()) { } + /// Returns the location of '('. + SourceLocation getLParenLoc() const { return LParenLoc; } + + /// Returns the location of the interop variable. + SourceLocation getVarLoc() const { return VarLoc; } + + /// Returns the interop variable. + Expr *getInteropVar() const { return cast_or_null(InteropVar); } + child_range children() { + if (InteropVar) + return child_range(&InteropVar, &InteropVar + 1); return child_range(child_iterator(), child_iterator()); } const_child_range children() const { + if (InteropVar) + return const_child_range(&InteropVar, &InteropVar + 1); return const_child_range(const_child_iterator(), const_child_iterator()); } diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 4a7c234e374b..256f73338bd2 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -3210,7 +3210,8 @@ bool RecursiveASTVisitor::VisitOMPUseClause(OMPUseClause *C) { } template -bool RecursiveASTVisitor::VisitOMPDestroyClause(OMPDestroyClause *) { +bool RecursiveASTVisitor::VisitOMPDestroyClause(OMPDestroyClause *C) { + TRY_TO(TraverseStmt(C->getInteropVar())); return true; } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 978c5de57646..b144587650eb 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10998,8 +10998,11 @@ public: SourceLocation VarLoc, SourceLocation EndLoc); /// Called on well-formed 'destroy' clause. - OMPClause *ActOnOpenMPDestroyClause(SourceLocation StartLoc, + OMPClause *ActOnOpenMPDestroyClause(Expr *InteropVar, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation VarLoc, SourceLocation EndLoc); + /// Called on well-formed 'threads' clause. 
OMPClause *ActOnOpenMPThreadsClause(SourceLocation StartLoc, SourceLocation EndLoc); diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index 1014c8e3a95e..254b42606408 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -1807,8 +1807,13 @@ void OMPClausePrinter::VisitOMPUseClause(OMPUseClause *Node) { OS << ")"; } -void OMPClausePrinter::VisitOMPDestroyClause(OMPDestroyClause *) { +void OMPClausePrinter::VisitOMPDestroyClause(OMPDestroyClause *Node) { OS << "destroy"; + if (Expr *E = Node->getInteropVar()) { + OS << "("; + E->printPretty(OS, nullptr, Policy); + OS << ")"; + } } template diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index c1ffa069d267..bf130ed4ff3d 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -552,7 +552,10 @@ void OMPClauseProfiler::VisitOMPUseClause(const OMPUseClause *C) { Profiler->VisitStmt(C->getInteropVar()); } -void OMPClauseProfiler::VisitOMPDestroyClause(const OMPDestroyClause *) {} +void OMPClauseProfiler::VisitOMPDestroyClause(const OMPDestroyClause *C) { + if (C->getInteropVar()) + Profiler->VisitStmt(C->getInteropVar()); +} template void OMPClauseProfiler::VisitOMPClauseList(T *Node) { diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index fe7998f6bfc8..2e0104e3d348 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2865,7 +2865,6 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_unified_shared_memory: case OMPC_reverse_offload: case OMPC_dynamic_allocators: - case OMPC_destroy: // OpenMP [2.7.1, Restrictions, p. 9] // Only one ordered clause can appear on a loop directive. // OpenMP [2.7.1, Restrictions, C/C++, p. 
4] @@ -2929,6 +2928,17 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_uses_allocators: Clause = ParseOpenMPUsesAllocatorClause(DKind); break; + case OMPC_destroy: + if (DKind != OMPD_interop) { + if (!FirstClause) { + Diag(Tok, diag::err_omp_more_one_clause) + << getOpenMPDirectiveName(DKind) << getOpenMPClauseName(CKind) << 0; + ErrorFound = true; + } + Clause = ParseOpenMPClause(CKind, WrongDirective); + break; + } + LLVM_FALLTHROUGH; case OMPC_init: case OMPC_use: Clause = ParseOpenMPInteropClause(CKind, WrongDirective); @@ -3160,6 +3170,10 @@ OMPClause *Parser::ParseOpenMPInteropClause(OpenMPClauseKind Kind, return Actions.ActOnOpenMPUseClause(InteropVarExpr.get(), Loc, T.getOpenLocation(), VarLoc, RLoc); + if (Kind == OMPC_destroy) + return Actions.ActOnOpenMPDestroyClause(InteropVarExpr.get(), Loc, + T.getOpenLocation(), VarLoc, RLoc); + llvm_unreachable("Unexpected interop variable clause."); } diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index c4da3e58f58c..54c824c4a759 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -14441,7 +14441,9 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind, Res = ActOnOpenMPDynamicAllocatorsClause(StartLoc, EndLoc); break; case OMPC_destroy: - Res = ActOnOpenMPDestroyClause(StartLoc, EndLoc); + Res = ActOnOpenMPDestroyClause(/*InteropVar=*/nullptr, StartLoc, + /*LParenLoc=*/SourceLocation(), + /*VarLoc=*/SourceLocation(), EndLoc); break; case OMPC_if: case OMPC_final: @@ -14599,19 +14601,13 @@ OMPClause *Sema::ActOnOpenMPDynamicAllocatorsClause(SourceLocation StartLoc, return new (Context) OMPDynamicAllocatorsClause(StartLoc, EndLoc); } -OMPClause *Sema::ActOnOpenMPDestroyClause(SourceLocation StartLoc, - SourceLocation EndLoc) { - return new (Context) OMPDestroyClause(StartLoc, EndLoc); -} - StmtResult Sema::ActOnOpenMPInteropDirective(ArrayRef Clauses, SourceLocation StartLoc, SourceLocation EndLoc) { // OpenMP 5.1 [2.15.1, interop Construct, Restrictions] // At least one action-clause must appear on a directive. - // TODO: also add 'destroy' here. - if (!hasClauses(Clauses, OMPC_init, OMPC_use, OMPC_nowait)) { + if (!hasClauses(Clauses, OMPC_init, OMPC_use, OMPC_destroy, OMPC_nowait)) { StringRef Expected = "'init', 'use', 'destroy', or 'nowait'"; Diag(StartLoc, diag::err_omp_no_clause_for_directive) << Expected << getOpenMPDirectiveName(OMPD_interop); @@ -14662,8 +14658,11 @@ StmtResult Sema::ActOnOpenMPInteropDirective(ArrayRef Clauses, const auto *UC = cast(C); VarLoc = UC->getVarLoc(); DRE = dyn_cast_or_null(UC->getInteropVar()); + } else if (ClauseKind == OMPC_destroy) { + const auto *DC = cast(C); + VarLoc = DC->getVarLoc(); + DRE = dyn_cast_or_null(DC->getInteropVar()); } - // TODO: 'destroy' clause to be added here. if (!DRE) continue; @@ -14723,8 +14722,7 @@ static bool isValidInteropVariable(Sema &SemaRef, Expr *InteropVarExpr, // OpenMP 5.1 [2.15.1, interop Construct, Restrictions] // The interop-var passed to init or destroy must be non-const. - // TODO: 'destroy' clause too. 
- if (Kind == OMPC_init && + if ((Kind == OMPC_init || Kind == OMPC_destroy) && isConstNotMutableType(SemaRef, InteropVarExpr->getType())) { SemaRef.Diag(VarLoc, diag::err_omp_interop_variable_expected) << /*non-const*/ 1; @@ -14773,6 +14771,19 @@ OMPClause *Sema::ActOnOpenMPUseClause(Expr *InteropVar, SourceLocation StartLoc, OMPUseClause(InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc); } +OMPClause *Sema::ActOnOpenMPDestroyClause(Expr *InteropVar, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation VarLoc, + SourceLocation EndLoc) { + if (InteropVar && + !isValidInteropVariable(*this, InteropVar, VarLoc, OMPC_destroy)) + return nullptr; + + return new (Context) + OMPDestroyClause(InteropVar, StartLoc, LParenLoc, VarLoc, EndLoc); +} + OMPClause *Sema::ActOnOpenMPVarListClause( OpenMPClauseKind Kind, ArrayRef VarList, Expr *DepModOrTailExpr, const OMPVarListLocTy &Locs, SourceLocation ColonLoc, diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 7f6432d83821..5fb2bfa85352 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -2196,6 +2196,18 @@ public: VarLoc, EndLoc); } + /// Build a new OpenMP 'destroy' clause. + /// + /// By default, performs semantic analysis to build the new OpenMP clause. + /// Subclasses may override this routine to provide different behavior. + OMPClause *RebuildOMPDestroyClause(Expr *InteropVar, SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation VarLoc, + SourceLocation EndLoc) { + return getSema().ActOnOpenMPDestroyClause(InteropVar, StartLoc, LParenLoc, + VarLoc, EndLoc); + } + /// Rebuild the operand to an Objective-C \@synchronized statement. /// /// By default, performs semantic analysis to build the new statement. @@ -9343,8 +9355,15 @@ OMPClause *TreeTransform::TransformOMPUseClause(OMPUseClause *C) { template OMPClause * TreeTransform::TransformOMPDestroyClause(OMPDestroyClause *C) { - // No need to rebuild this clause, no template-dependent parameters. 
- return C; + ExprResult ER; + if (Expr *IV = C->getInteropVar()) { + ER = getDerived().TransformExpr(IV); + if (ER.isInvalid()) + return nullptr; + } + return getDerived().RebuildOMPDestroyClause(ER.get(), C->getBeginLoc(), + C->getLParenLoc(), C->getVarLoc(), + C->getEndLoc()); } template diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 5dd30017113c..a76bda15076b 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -12156,7 +12156,11 @@ void OMPClauseReader::VisitOMPUseClause(OMPUseClause *C) { C->setVarLoc(Record.readSourceLocation()); } -void OMPClauseReader::VisitOMPDestroyClause(OMPDestroyClause *) {} +void OMPClauseReader::VisitOMPDestroyClause(OMPDestroyClause *C) { + C->setInteropVar(Record.readSubExpr()); + C->setLParenLoc(Record.readSourceLocation()); + C->setVarLoc(Record.readSourceLocation()); +} void OMPClauseReader::VisitOMPUnifiedAddressClause(OMPUnifiedAddressClause *) {} diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index 11deaf65254f..18decd9e6bc1 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -6231,7 +6231,11 @@ void OMPClauseWriter::VisitOMPUseClause(OMPUseClause *C) { Record.AddSourceLocation(C->getVarLoc()); } -void OMPClauseWriter::VisitOMPDestroyClause(OMPDestroyClause *) {} +void OMPClauseWriter::VisitOMPDestroyClause(OMPDestroyClause *C) { + Record.AddStmt(C->getInteropVar()); + Record.AddSourceLocation(C->getLParenLoc()); + Record.AddSourceLocation(C->getVarLoc()); +} void OMPClauseWriter::VisitOMPPrivateClause(OMPPrivateClause *C) { Record.push_back(C->varlist_size()); diff --git a/clang/test/OpenMP/interop_ast_print.cpp b/clang/test/OpenMP/interop_ast_print.cpp index 24d95268c653..8f8ddc839c72 100644 --- a/clang/test/OpenMP/interop_ast_print.cpp +++ b/clang/test/OpenMP/interop_ast_print.cpp @@ -41,6 +41,12 @@ void foo1(int *ap, int dev) { //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'I' #pragma omp interop use(I) + //PRINT: #pragma omp interop destroy(I) + //DUMP: OMPInteropDirective + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'I' + #pragma omp interop destroy(I) + //PRINT: #pragma omp interop init(target : IRef) //DUMP: OMPInteropDirective //DUMP: OMPInitClause @@ -53,6 +59,12 @@ void foo1(int *ap, int dev) { //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'IRef' #pragma omp interop use(IRef) + //PRINT: #pragma omp interop destroy(IRef) + //DUMP: OMPInteropDirective + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'IRef' + #pragma omp interop destroy(IRef) + const omp_interop_t CI = (omp_interop_t)0; //PRINT: #pragma omp interop use(CI) //DUMP: OMPInteropDirective @@ -80,6 +92,16 @@ void foo1(int *ap, int dev) { //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'I' #pragma omp interop device(dev) depend(inout:ap) use(I) + //PRINT: #pragma omp interop device(dev) depend(inout : ap) destroy(I) + //DUMP: OMPInteropDirective + //DUMP: OMPDeviceClause + //DUMP: DeclRefExpr{{.*}}'dev' 'int' + //DUMP: OMPDependClause + //DUMP: DeclRefExpr{{.*}}'ap' 'int *' + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'I' + #pragma omp interop device(dev) depend(inout:ap) destroy(I) + //PRINT: #pragma omp interop init(prefer_type(1,2,3,4,5,6), targetsync : I) //DUMP: OMPInteropDirective //DUMP: OMPInitClause @@ -150,6 +172,30 @@ void foo1(int *ap, int dev) { 
//DUMP: OMPUseClause //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'J' #pragma omp interop use(I) use(J) + + //PRINT: #pragma omp interop destroy(I) destroy(J) + //DUMP: OMPInteropDirective + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'I' + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'J' + #pragma omp interop destroy(I) destroy(J) + + //PRINT: #pragma omp interop init(target : I) destroy(J) + //DUMP: OMPInteropDirective + //DUMP: OMPInitClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'I' + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'J' + #pragma omp interop init(target:I) destroy(J) + + //PRINT: #pragma omp interop destroy(I) use(J) + //DUMP: OMPInteropDirective + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'I' + //DUMP: OMPUseClause + //DUMP: DeclRefExpr{{.*}}'omp_interop_t'{{.*}}Var{{.*}}'J' + #pragma omp interop destroy(I) use(J) } //DUMP: FunctionTemplateDecl{{.*}}fooTemp @@ -200,6 +246,12 @@ void barTemp(T t) { //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'t' 'T' #pragma omp interop use(t) + //PRINT: #pragma omp interop destroy(t) + //DUMP: OMPInteropDirective + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'t' 'T' + #pragma omp interop destroy(t) + //DUMP: FunctionDecl{{.*}}barTemp 'void (void *)' //DUMP: TemplateArgument type 'void *' //DUMP: ParmVarDecl{{.*}}t 'void *' @@ -211,6 +263,10 @@ void barTemp(T t) { //DUMP: OMPUseClause //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'t' 'void *' //PRINT: #pragma omp interop use(t) + //DUMP: OMPInteropDirective + //DUMP: OMPDestroyClause + //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'t' 'void *' + //PRINT: #pragma omp interop destroy(t) } void bar() diff --git a/clang/test/OpenMP/interop_messages.cpp b/clang/test/OpenMP/interop_messages.cpp index 550cf81b5370..50f1efb5a6a9 100644 --- a/clang/test/OpenMP/interop_messages.cpp +++ b/clang/test/OpenMP/interop_messages.cpp @@ -17,6 +17,9 @@ void foo(int *Ap) { //expected-error@+1 {{use of undeclared identifier 'NoDeclVar'}} #pragma omp interop use(NoDeclVar) use(Another) + //expected-error@+1 {{use of undeclared identifier 'NoDeclVar'}} + #pragma omp interop destroy(NoDeclVar) destroy(Another) + //expected-error@+2 {{expected interop type: 'target' and/or 'targetsync'}} //expected-error@+1 {{expected expression}} #pragma omp interop init(InteropVar) init(target:Another) @@ -38,6 +41,9 @@ void foo(int *Ap) { //expected-error@+1 {{interop variable must be of type 'omp_interop_t'}} #pragma omp interop use(IntVar) use(Another) + //expected-error@+1 {{interop variable must be of type 'omp_interop_t'}} + #pragma omp interop destroy(IntVar) destroy(Another) + //expected-error@+1 {{interop variable must be of type 'omp_interop_t'}} #pragma omp interop init(prefer_type(1,"sycl",3),target:SVar) \ init(target:Another) @@ -45,6 +51,9 @@ void foo(int *Ap) { //expected-error@+1 {{interop variable must be of type 'omp_interop_t'}} #pragma omp interop use(SVar) use(Another) + //expected-error@+1 {{interop variable must be of type 'omp_interop_t'}} + #pragma omp interop destroy(SVar) destroy(Another) + int a, b; //expected-error@+1 {{expected variable of type 'omp_interop_t'}} #pragma omp interop init(target:a+b) init(target:Another) @@ -52,10 +61,16 @@ void foo(int *Ap) { //expected-error@+1 {{expected variable of type 'omp_interop_t'}} #pragma omp interop use(a+b) use(Another) + //expected-error@+1 {{expected variable of type 
'omp_interop_t'}}
+  #pragma omp interop destroy(a+b) destroy(Another)
+
   const omp_interop_t C = (omp_interop_t)5;
   //expected-error@+1 {{expected non-const variable of type 'omp_interop_t'}}
   #pragma omp interop init(target:C) init(target:Another)

+  //expected-error@+1 {{expected non-const variable of type 'omp_interop_t'}}
+  #pragma omp interop destroy(C) destroy(Another)
+
   //expected-error@+1 {{prefer_list item must be a string literal or constant integral expression}}
   #pragma omp interop init(prefer_type(1.0),target:InteropVar) \
   init(target:Another)
@@ -79,9 +94,18 @@ void foo(int *Ap) {
   //expected-error@+1 {{interop variable 'InteropVar' used in multiple action clauses}}
   #pragma omp interop use(InteropVar) use(InteropVar)

+  //expected-error@+1 {{interop variable 'InteropVar' used in multiple action clauses}}
+  #pragma omp interop destroy(InteropVar) destroy(InteropVar)
+
   //expected-error@+1 {{interop variable 'InteropVar' used in multiple action clauses}}
   #pragma omp interop init(target:InteropVar) use(InteropVar)

+  //expected-error@+1 {{interop variable 'InteropVar' used in multiple action clauses}}
+  #pragma omp interop init(target:InteropVar) destroy(InteropVar)
+
+  //expected-error@+1 {{interop variable 'InteropVar' used in multiple action clauses}}
+  #pragma omp interop use(InteropVar) destroy(InteropVar)
+
   //expected-error@+1 {{directive '#pragma omp interop' cannot contain more than one 'device' clause}}
   #pragma omp interop init(target:InteropVar) device(0) device(1)
@@ -99,5 +123,7 @@ void foo() {
   #pragma omp interop init(prefer_type(1,"sycl",3),target:InteropVar) nowait
   //expected-error@+1 {{'omp_interop_t' type not found; include <omp.h>}}
   #pragma omp interop use(InteropVar) nowait
+  //expected-error@+1 {{'omp_interop_t' type not found; include <omp.h>}}
+  #pragma omp interop destroy(InteropVar) nowait
 }
 #endif
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index 235f5db2bfee..841b36a6036c 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -2286,7 +2286,10 @@ void OMPClauseEnqueue::VisitOMPUseClause(const OMPUseClause *C) {
   Visitor->AddStmt(C->getInteropVar());
 }

-void OMPClauseEnqueue::VisitOMPDestroyClause(const OMPDestroyClause *) {}
+void OMPClauseEnqueue::VisitOMPDestroyClause(const OMPDestroyClause *C) {
+  if (C->getInteropVar())
+    Visitor->AddStmt(C->getInteropVar());
+}

 void OMPClauseEnqueue::VisitOMPUnifiedAddressClause(
     const OMPUnifiedAddressClause *) {}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index 685732140eee..abd636c07e9c 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -1650,6 +1650,7 @@ def OMP_interop : Directive<"interop"> {
   let allowedClauses = [
     VersionedClause,
     VersionedClause,
+    VersionedClause,
     VersionedClause,
     VersionedClause,
     VersionedClause,
-- GitLab
From 4b1c8070bb8c3d59f746c4daa16f27547cd71b86 Mon Sep 17 00:00:00 2001
From: Mircea Trofin
Date: Thu, 18 Mar 2021 09:11:28 -0700
Subject: [PATCH 0053/1000] [NFC][ArgumentPromotion] Clear FAM cached results
 of erased function.

Not doing it here can lead to subtle bugs: the analysis results are keyed by
the Function object's address. Nothing stops the memory allocator from
allocating new functions at the same address.
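To make the invariant concrete, here is a minimal sketch (not part of the
patch) of the pattern the change enforces: cached analyses keyed by a
Function's address must be dropped before the Function's memory is freed.
`FAM.clear` and `eraseFromParent` are the real LLVM APIs used in the diff
below; the wrapper function itself is hypothetical.

    #include "llvm/IR/Function.h"
    #include "llvm/IR/PassManager.h"

    // Hypothetical helper illustrating the required ordering. FAM caches
    // results in maps keyed by &OldF, so the entry must be cleared before
    // the allocator can hand the same address to a newly created Function.
    static void eraseFunctionSafely(llvm::FunctionAnalysisManager &FAM,
                                    llvm::Function &OldF) {
      FAM.clear(OldF, OldF.getName()); // drop cached analyses keyed by &OldF
      OldF.eraseFromParent();          // only now is it safe to free the body
    }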
--- llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index fe5cd7671213..5f24d53da0b3 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -1052,6 +1052,7 @@ PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C, // swaps out the particular function mapped to a particular node in the // graph. C.getOuterRefSCC().replaceNodeFunction(N, *NewF); + FAM.clear(OldF, OldF.getName()); OldF.eraseFromParent(); } -- GitLab From 6dad34454d4147b401dce668379b88acb748b789 Mon Sep 17 00:00:00 2001 From: Ricky Taylor Date: Thu, 18 Mar 2021 16:29:08 +0000 Subject: [PATCH 0054/1000] Test commit This is a test commit to verify my access. -- GitLab From 92ccc6cb17a4fd1b9506bac51f2eb1a96f4cd345 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Fri, 12 Mar 2021 08:21:14 -0800 Subject: [PATCH 0055/1000] Reapply "[NPM][CGSCC] FunctionAnalysisManagerCGSCCProxy: do not clear immutable function passes" This reverts commit 11b70b9e3a7458b5b78c30020b56e8ca563a4801. The bot failure was due to ArgumentPromotion deleting functions without deleting their analyses. This was separately fixed in 4b1c807. --- .../test/CodeGen/thinlto-distributed-newpm.ll | 14 +++-------- llvm/lib/Analysis/CGSCCPassManager.cpp | 2 +- .../Analysis/CGSCCPassManagerTest.cpp | 25 +++++++++++++++++++ 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/clang/test/CodeGen/thinlto-distributed-newpm.ll b/clang/test/CodeGen/thinlto-distributed-newpm.ll index 867203417754..1e9d5d4d2629 100644 --- a/clang/test/CodeGen/thinlto-distributed-newpm.ll +++ b/clang/test/CodeGen/thinlto-distributed-newpm.ll @@ -12,7 +12,7 @@ ; RUN: %clang -target x86_64-grtev4-linux-gnu \ ; RUN: -O2 -fexperimental-new-pass-manager -Xclang -fdebug-pass-manager \ ; RUN: -c -fthinlto-index=%t.o.thinlto.bc \ -; RUN: -o %t.native.o -x ir %t.o 2>&1 | FileCheck -check-prefixes=CHECK-O,CHECK-O2 %s --dump-input=fail +; RUN: -o %t.native.o -x ir %t.o 2>&1 | FileCheck -check-prefix=CHECK-O %s --dump-input=fail ; RUN: %clang -target x86_64-grtev4-linux-gnu \ ; RUN: -O3 -fexperimental-new-pass-manager -Xclang -fdebug-pass-manager \ @@ -70,24 +70,19 @@ ; CHECK-O: Starting CGSCC pass manager run. ; CHECK-O: Running pass: InlinerPass on (main) ; CHECK-O: Running pass: PostOrderFunctionAttrsPass on (main) -; CHECK-O: Clearing all analysis results for: main +; CHECK-O: Invalidating analysis: DominatorTreeAnalysis on main +; CHECK-O: Invalidating analysis: BasicAA on main +; CHECK-O: Invalidating analysis: AAManager on main ; CHECK-O3: Running pass: ArgumentPromotionPass on (main) -; CHECK-O3: Running analysis: TargetIRAnalysis on main ; CHECK-O: Starting {{.*}}Function pass manager run. 
; CHECK-O: Running pass: SROA on main ; These next two can appear in any order since they are accessed as parameters ; on the same call to SROA::runImpl ; CHECK-O-DAG: Running analysis: DominatorTreeAnalysis on main -; CHECK-O-DAG: Running analysis: AssumptionAnalysis on main ; CHECK-O: Running pass: EarlyCSEPass on main -; CHECK-O: Running analysis: TargetLibraryAnalysis on main -; CHECK-O2: Running analysis: TargetIRAnalysis on main ; CHECK-O: Running analysis: MemorySSAAnalysis on main ; CHECK-O: Running analysis: AAManager on main ; CHECK-O: Running analysis: BasicAA on main -; CHECK-O: Running analysis: ScopedNoAliasAA on main -; CHECK-O: Running analysis: TypeBasedAA on main -; CHECK-O: Running analysis: OuterAnalysisManagerProxy ; CHECK-O: Running pass: SpeculativeExecutionPass on main ; CHECK-O: Running pass: JumpThreadingPass on main ; CHECK-O: Running analysis: LazyValueAnalysis on main @@ -96,7 +91,6 @@ ; CHECK-O: Running pass: SimplifyCFGPass on main ; CHECK-O3: Running pass: AggressiveInstCombinePass on main ; CHECK-O: Running pass: InstCombinePass on main -; CHECK-O: Running analysis: OptimizationRemarkEmitterAnalysis on main ; CHECK-O: Running pass: LibCallsShrinkWrapPass on main ; CHECK-O: Running pass: TailCallElimPass on main ; CHECK-O: Running pass: SimplifyCFGPass on main diff --git a/llvm/lib/Analysis/CGSCCPassManager.cpp b/llvm/lib/Analysis/CGSCCPassManager.cpp index 9dc62b877ae2..eaaa3d09a7f2 100644 --- a/llvm/lib/Analysis/CGSCCPassManager.cpp +++ b/llvm/lib/Analysis/CGSCCPassManager.cpp @@ -720,7 +720,7 @@ bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate( auto PAC = PA.getChecker(); if (!PAC.preserved() && !PAC.preservedSet>()) { for (LazyCallGraph::Node &N : C) - FAM->clear(N.getFunction(), N.getFunction().getName()); + FAM->invalidate(N.getFunction(), PA); return false; } diff --git a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp index 59ff97d0fc1a..ceaeaaf83e5d 100644 --- a/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp +++ b/llvm/unittests/Analysis/CGSCCPassManagerTest.cpp @@ -1942,5 +1942,30 @@ TEST_F(CGSCCPassManagerTest, TestInsertionOfNewNonTrivialCallEdge) { ASSERT_TRUE(Ran); } +TEST_F(CGSCCPassManagerTest, TestFunctionPassesAreQueriedForInvalidation) { + std::unique_ptr M = parseIR("define void @f() { ret void }"); + CGSCCPassManager CGPM; + bool SCCCalled = false; + FunctionPassManager FPM; + int ImmRuns = 0; + FAM.registerPass([&] { return TestImmutableFunctionAnalysis(ImmRuns); }); + FPM.addPass(RequireAnalysisPass()); + CGPM.addPass( + LambdaSCCPass([&](LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, + LazyCallGraph &CG, CGSCCUpdateResult &UR) { + SCCCalled = true; + return PreservedAnalyses::none(); + })); + CGPM.addPass(createCGSCCToFunctionPassAdaptor( + RequireAnalysisPass())); + ModulePassManager MPM; + + MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); + MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM))); + MPM.run(*M, MAM); + ASSERT_EQ(ImmRuns, 1); + ASSERT_TRUE(SCCCalled); +} + #endif } // namespace -- GitLab From 14756b70eeba76c0adeb73b82c4e69b35b74cdbe Mon Sep 17 00:00:00 2001 From: Wei Mi Date: Wed, 17 Mar 2021 17:51:27 -0700 Subject: [PATCH 0056/1000] [SampleFDO] Don't mix up the existing indirect call value profile with the new value profile annotated after inlining. 
In https://reviews.llvm.org/D96806 and https://reviews.llvm.org/D97350, we
use the magic number -1 in the value profile to avoid repeated indirect call
promotion to the same target for an indirect call. Function updateIDTMetaData
is used to mark a target as being promoted in the value profile with the
magic number. updateIDTMetaData is also used to update the value profile when
an indirect call is inlined and a new inline instance profile should be
applied. For the second case, currently updateIDTMetaData mixes up the
existing value profile of the indirect call with the new profile, leading to
the problematic scenario that a target count is larger than the total count
in the value profile. The patch fixes the problem.

When updateIDTMetaData is used to update the value profile after inlining,
all the values in the existing value profile will be dropped except the
values with the magic number counts.

Differential Revision: https://reviews.llvm.org/D98835
---
 llvm/lib/Transforms/IPO/SampleProfile.cpp     | 67 ++++++++---------
 .../Inputs/norepeated-icp-3.prof              |  6 ++
 .../SampleProfile/norepeated-icp-3.ll         | 71 +++++++++++++++++++
 3 files changed, 112 insertions(+), 32 deletions(-)
 create mode 100644 llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp-3.prof
 create mode 100644 llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll

diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 2ecff87f492f..561165aea9b8 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -755,14 +755,8 @@ static void
 updateIDTMetaData(Instruction &Inst,
                   const SmallVectorImpl<InstrProfValueData> &CallTargets,
                   uint64_t Sum) {
-  assert((Sum != 0 || (CallTargets.size() == 1 &&
-                       CallTargets[0].Count == NOMORE_ICP_MAGICNUM)) &&
-         "If sum is 0, assume only one element in CallTargets with count "
-         "being NOMORE_ICP_MAGICNUM");
-
   uint32_t NumVals = 0;
   // OldSum is the existing total count in the value profile data.
-  // It will be replaced by Sum if Sum is not 0.
   uint64_t OldSum = 0;
   std::unique_ptr<InstrProfValueData[]> ValueData =
       std::make_unique<InstrProfValueData[]>(MaxNumPromotions);
@@ -771,34 +765,44 @@ updateIDTMetaData(Instruction &Inst,
                                 ValueData.get(), NumVals, OldSum, true);

   DenseMap<uint64_t, uint64_t> ValueCountMap;
-  // Initialize ValueCountMap with existing value profile data.
-  if (Valid) {
-    for (uint32_t I = 0; I < NumVals; I++)
-      ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
-  }
-
-  for (const auto &Data : CallTargets) {
-    auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
-    if (Pair.second)
-      continue;
-    // Whenever the count is NOMORE_ICP_MAGICNUM for a value, keep it
-    // in the ValueCountMap. If both the count in CallTargets and the
-    // count in ValueCountMap is not NOMORE_ICP_MAGICNUM, keep the
-    // count in CallTargets.
-    if (Pair.first->second != NOMORE_ICP_MAGICNUM &&
-        Data.Count == NOMORE_ICP_MAGICNUM) {
+  if (Sum == 0) {
+    assert((CallTargets.size() == 1 &&
+            CallTargets[0].Count == NOMORE_ICP_MAGICNUM) &&
+           "If sum is 0, assume only one element in CallTargets "
+           "with count being NOMORE_ICP_MAGICNUM");
+    // Initialize ValueCountMap with existing value profile data.
+    if (Valid) {
+      for (uint32_t I = 0; I < NumVals; I++)
+        ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
+    }
+    auto Pair =
+        ValueCountMap.try_emplace(CallTargets[0].Value, CallTargets[0].Count);
+    // If the target already exists in value profile, decrease the total
+    // count OldSum and reset the target's count to NOMORE_ICP_MAGICNUM.
+    if (!Pair.second) {
       OldSum -= Pair.first->second;
       Pair.first->second = NOMORE_ICP_MAGICNUM;
-    } else if (Pair.first->second == NOMORE_ICP_MAGICNUM &&
-               Data.Count != NOMORE_ICP_MAGICNUM) {
+    }
+    Sum = OldSum;
+  } else {
+    // Initialize ValueCountMap with existing NOMORE_ICP_MAGICNUM
+    // counts in the value profile.
+    if (Valid) {
+      for (uint32_t I = 0; I < NumVals; I++) {
+        if (ValueData[I].Count == NOMORE_ICP_MAGICNUM)
+          ValueCountMap[ValueData[I].Value] = ValueData[I].Count;
+      }
+    }
+
+    for (const auto &Data : CallTargets) {
+      auto Pair = ValueCountMap.try_emplace(Data.Value, Data.Count);
+      if (Pair.second)
+        continue;
+      // The target represented by Data.Value has already been promoted.
+      // Keep the count as NOMORE_ICP_MAGICNUM in the profile and decrease
+      // Sum by Data.Count.
       assert(Sum >= Data.Count && "Sum should never be less than Data.Count");
       Sum -= Data.Count;
-    } else if (Pair.first->second != NOMORE_ICP_MAGICNUM &&
-               Data.Count != NOMORE_ICP_MAGICNUM) {
-      // Sum will be used in this case. Although the existing count
-      // for the current value in value profile will be overridden,
-      // no need to update OldSum.
-      Pair.first->second = Data.Count;
     }
   }
@@ -818,8 +822,7 @@ updateIDTMetaData(Instruction &Inst,
   uint32_t MaxMDCount =
       std::min(NewCallTargets.size(), static_cast<size_t>(MaxNumPromotions));
   annotateValueSite(*Inst.getParent()->getParent()->getParent(), Inst,
-                    NewCallTargets, Sum ? Sum : OldSum, IPVK_IndirectCallTarget,
-                    MaxMDCount);
+                    NewCallTargets, Sum, IPVK_IndirectCallTarget, MaxMDCount);
 }

 /// Attempt to promote indirect call and also inline the promoted call.
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp-3.prof b/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp-3.prof
new file mode 100644
index 000000000000..a65c792bf070
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/norepeated-icp-3.prof
@@ -0,0 +1,6 @@
+_Z3foov:225715:1
+ 2: 5553
+ 3: 5391
+ 1: _Z3goov:5860
+  1: 5279 _Z3hoov:5860 _Z3moov:210
+  2: 5279
diff --git a/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll b/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll
new file mode 100644
index 000000000000..140a15f58747
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/norepeated-icp-3.ll
@@ -0,0 +1,71 @@
+; RUN: opt < %s -passes=sample-profile -sample-profile-icp-max-prom=4 -sample-profile-file=%S/Inputs/norepeated-icp-3.prof -S | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@.str = private unnamed_addr constant [5 x i8] c"hoo\0A\00", align 1
+@p = dso_local global void ()* null, align 8
+@str = private unnamed_addr constant [4 x i8] c"hoo\00", align 1
+
+; Function Attrs: nofree nounwind
+declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) #1
+
+; Function Attrs: uwtable mustprogress
+define dso_local void @_Z3goov() #0 !dbg !11 {
+entry:
+  %0 = load void ()*, void ()** @p, align 8, !dbg !12, !tbaa !13
+  call void %0(), !dbg !17, !prof !22
+  ret void, !dbg !18
+}
+
+; After the indirect call in _Z3goov is inlined into _Z3foov, it will be
+; annotated with a new inline instance profile. The existing value profile
+; associated with the indirect call should be dropped except those values
+; with NOMORE_ICP_MAGICNUM magic number indicating promoted targets.
+; CHECK-LABEL: @_Z3foov( +; CHECK: call void %0(), {{.*}} !prof ![[PROF_ID:[0-9]+]] +; CHECK-NEXT: ret void + +; Function Attrs: uwtable mustprogress +define dso_local void @_Z3foov() #0 !dbg !19 { +entry: + call void @_Z3goov(), !dbg !20 + ret void, !dbg !21 +} + +; Function Attrs: nofree nounwind +declare noundef i32 @puts(i8* nocapture noundef readonly) #2 + +attributes #0 = { uwtable mustprogress "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-sample-profile" "use-soft-float"="false" } +attributes #1 = { nofree nounwind "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nofree nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None) +!1 = !DIFile(filename: "1.cc", directory: "") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!""} +!8 = !DISubroutineType(types: !2) +!11 = distinct !DISubprogram(name: "goo", linkageName: "_Z3goov", scope: !1, file: !1, line: 6, type: !8, scopeLine: 6, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!12 = !DILocation(line: 7, column: 5, scope: !11) +!13 = !{!14, !14, i64 0} +!14 = !{!"any pointer", !15, i64 0} +!15 = !{!"omnipotent char", !16, i64 0} +!16 = !{!"Simple C++ TBAA"} +!17 = !DILocation(line: 7, column: 3, scope: !11) +!18 = !DILocation(line: 8, column: 1, scope: !11) +!19 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 10, type: !8, scopeLine: 10, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!20 = !DILocation(line: 11, column: 3, scope: !19) +!21 = !DILocation(line: 12, column: 3, scope: !19) +; The original value 125292384912345234234 and its count 8000 should +; be dropped when the indirect call is annotated with new profile. +; The original value -7383239051784516332 and its count -1 should be kept +; because -1 is NOMORE_ICP_MAGICNUM. 
+; CHECK: ![[PROF_ID]] = !{!"VP", i32 0, i64 5860, i64 -7383239051784516332, i64 -1, i64 -7701940972712279918, i64 5860} +!22 = !{!"VP", i32 0, i64 8000, i64 -7383239051784516332, i64 -1, i64 125292384912345234234, i64 8000} -- GitLab From 961e4384f4e938b901490912813ff0e8347cc3c0 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Tue, 16 Mar 2021 11:57:45 -0700 Subject: [PATCH 0057/1000] [AMDGPU] Support SCC on buffer atomics Differential Revision: https://reviews.llvm.org/D98731 --- llvm/lib/Target/AMDGPU/BUFInstructions.td | 23 +++++++++++-------- llvm/test/MC/AMDGPU/gfx90a_asm_features.s | 4 ++++ llvm/test/MC/AMDGPU/gfx90a_err.s | 15 ++++++++++++ .../AMDGPU/gfx90a_dasm_features.txt | 3 +++ 4 files changed, 35 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 6a760bac311b..d367969702e3 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -679,7 +679,7 @@ class MUBUF_Atomic_Pseudo; // GFX8, GFX9 (VI). //===----------------------------------------------------------------------===// -class MUBUF_Real_Base_vi op, MUBUF_Pseudo ps, int Enc> : +class MUBUF_Real_Base_vi op, MUBUF_Pseudo ps, int Enc, + bit has_sccb = ps.has_sccb> : MUBUF_Real, Enc64, SIMCInstr, @@ -2270,7 +2271,7 @@ class MUBUF_Real_Base_vi op, MUBUF_Pseudo ps, int Enc> : let Inst{12} = ps.offen; let Inst{13} = ps.idxen; let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, ps.glc_value); - let Inst{15} = !if(ps.has_sccb, cpol{CPolBit.SCC}, ps.sccb_value); + let Inst{15} = !if(has_sccb, cpol{CPolBit.SCC}, ps.sccb_value); let Inst{16} = ps.lds; let Inst{17} = !if(ps.has_slc, cpol{CPolBit.SLC}, ?); let Inst{24-18} = op; @@ -2281,26 +2282,28 @@ class MUBUF_Real_Base_vi op, MUBUF_Pseudo ps, int Enc> : let Inst{63-56} = !if(ps.has_soffset, soffset, ?); } -class MUBUF_Real_vi op, MUBUF_Pseudo ps> : - MUBUF_Real_Base_vi { +class MUBUF_Real_vi op, MUBUF_Pseudo ps, bit has_sccb = ps.has_sccb> : + MUBUF_Real_Base_vi { let AssemblerPredicate = isGFX8GFX9NotGFX90A; let DecoderNamespace = "GFX8"; let Inst{55} = !if(ps.has_tfe, tfe, ?); } -class MUBUF_Real_gfx90a op, MUBUF_Pseudo ps> : - MUBUF_Real_Base_vi { +class MUBUF_Real_gfx90a op, MUBUF_Pseudo ps, + bit has_sccb = ps.has_sccb> : + MUBUF_Real_Base_vi { let AssemblerPredicate = isGFX90APlus; let DecoderNamespace = "GFX90A"; - let AsmString = ps.Mnemonic # !subst("$tfe", "", ps.AsmOperands); + let AsmString = ps.Mnemonic # !subst("$sccb", !if(has_sccb, "$sccb",""), + !subst("$tfe", "", ps.AsmOperands)); let Inst{55} = acc; } multiclass MUBUF_Real_vi_gfx90a op, MUBUF_Pseudo ps> { def _vi : MUBUF_Real_vi; - def _gfx90a : MUBUF_Real_gfx90a; + def _gfx90a : MUBUF_Real_gfx90a; } multiclass MUBUF_Real_AllAddr_vi op> { @@ -2483,7 +2486,7 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>; } // End SubtargetPredicate = HasAtomicFaddInsts -let SubtargetPredicate = isGFX90APlus, AssemblerPredicate = isGFX90APlus in { +let SubtargetPredicate = isGFX90APlus in { defm BUFFER_ATOMIC_ADD_F64 : MUBUF_Real_Atomic_vi<0x4f>; defm BUFFER_ATOMIC_MIN_F64 : MUBUF_Real_Atomic_vi<0x50>; defm BUFFER_ATOMIC_MAX_F64 : MUBUF_Real_Atomic_vi<0x51>; diff --git a/llvm/test/MC/AMDGPU/gfx90a_asm_features.s b/llvm/test/MC/AMDGPU/gfx90a_asm_features.s index 38fa212175d8..fac42fd900ce 100644 --- a/llvm/test/MC/AMDGPU/gfx90a_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx90a_asm_features.s @@ -1046,3 +1046,7 @@ global_atomic_add_f32 v1, v0, v2, s[0:1] glc ; encoding: 
[0x00,0x80,0x35,0xdd,0x // GFX1010: error: instruction not supported on this GPU // GFX90A: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; encoding: [0x00,0x80,0x39,0xdd,0x00,0x02,0x7f,0x00] global_atomic_pk_add_f16 v0, v[0:1], v2, off glc + +// NOT-GFX90A: error: scc modifier is not supported on this GPU +// GFX90A: buffer_atomic_add v4, off, s[8:11], s3 scc ; encoding: [0x00,0x80,0x08,0xe1,0x00,0x04,0x02,0x03] +buffer_atomic_add v4, off, s[8:11], s3 scc diff --git a/llvm/test/MC/AMDGPU/gfx90a_err.s b/llvm/test/MC/AMDGPU/gfx90a_err.s index 15df69b05a17..44c48595ca17 100644 --- a/llvm/test/MC/AMDGPU/gfx90a_err.s +++ b/llvm/test/MC/AMDGPU/gfx90a_err.s @@ -231,6 +231,21 @@ global_atomic_min_f64 v[0:1], v[2:3], off scc global_atomic_max_f64 v[0:1], v[2:3], off scc // GFX90A: error: instruction must not use scc +buffer_atomic_add_f32 v4, off, s[8:11], s3 scc +// GFX90A: error: instruction must not use scc + +buffer_atomic_pk_add_f16 v4, off, s[8:11], s3 scc +// GFX90A: error: instruction must not use scc + +buffer_atomic_add_f64 v[4:5], off, s[8:11], s3 scc +// GFX90A: error: instruction must not use scc + +buffer_atomic_max_f64 v[4:5], off, s[8:11], s3 scc +// GFX90A: error: instruction must not use scc + +buffer_atomic_min_f64 v[4:5], off, s[8:11], s3 scc +// GFX90A: error: instruction must not use scc + v_mov_b32_sdwa v1, src_lds_direct dst_sel:DWORD // GFX90A: error: lds_direct is not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt index cc007a6cd4ca..bc5c6509d738 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx90a_dasm_features.txt @@ -793,3 +793,6 @@ # GFX90A: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc ; encoding: [0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00] 0x00,0x00,0x41,0xdd,0x00,0x02,0x00,0x00 + +# GFX90A: buffer_atomic_add v4, off, s[8:11], s3 scc ; encoding: [0x00,0x80,0x08,0xe1,0x00,0x04,0x02,0x03] +0x00,0x80,0x08,0xe1,0x00,0x04,0x02,0x03 -- GitLab From 626a31de15212a0e0c25df8435753cb9a0684668 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 18 Mar 2021 17:00:41 +0000 Subject: [PATCH 0058/1000] [libomptarget] Add register usage info to kernel metadata Add register usage information to the runtime metadata so that it can be used during kernel launch (that change will be in a different commit). Add this information to the kernel trace. 
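As a rough illustration (not upstream code), the four new fields can be
consumed like this once a kernel's metadata has been loaded; the struct below
is a stripped-down stand-in for the `atl_kernel_info_t` extended in
`internal.h`, and the print function mirrors the trace line extended in
`rtl.cpp`:

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the fields this patch adds to atl_kernel_info_t.
    struct KernelRegInfo {
      uint32_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count;
    };

    // Dump register usage so a launch-time heuristic (future commit) or a
    // human reading the kernel trace can inspect it.
    static void printRegUsage(const KernelRegInfo &info, const char *name) {
      std::fprintf(stderr,
                   "%s sgpr_count:%u vgpr_count:%u sgpr_spill_count:%u "
                   "vgpr_spill_count:%u\n",
                   name, info.sgpr_count, info.vgpr_count,
                   info.sgpr_spill_count, info.vgpr_spill_count);
    }
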
Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D98829 --- .../plugins/amdgpu/impl/internal.h | 4 +++ .../plugins/amdgpu/impl/system.cpp | 26 ++++++++++++++++++- .../libomptarget/plugins/amdgpu/src/rtl.cpp | 20 +++++++++++--- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/openmp/libomptarget/plugins/amdgpu/impl/internal.h b/openmp/libomptarget/plugins/amdgpu/impl/internal.h index 1b1d69328785..8ca66a9d478e 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/internal.h +++ b/openmp/libomptarget/plugins/amdgpu/impl/internal.h @@ -97,6 +97,10 @@ typedef struct atl_kernel_info_s { uint64_t kernel_object; uint32_t group_segment_size; uint32_t private_segment_size; + uint32_t sgpr_count; + uint32_t vgpr_count; + uint32_t sgpr_spill_count; + uint32_t vgpr_spill_count; uint32_t kernel_segment_size; uint32_t num_args; std::vector arg_alignments; diff --git a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp index da152b4045d1..d6cde1f699c2 100644 --- a/openmp/libomptarget/plugins/amdgpu/impl/system.cpp +++ b/openmp/libomptarget/plugins/amdgpu/impl/system.cpp @@ -832,7 +832,31 @@ static hsa_status_t get_code_object_custom_metadata(void *binary, msgpack_errors += map_lookup_string(element, ".symbol", &symbolName); msgpackErrorCheck(strings lookup in kernel metadata, msgpack_errors); - atl_kernel_info_t info = {0, 0, 0, 0, 0, {}, {}, {}}; + atl_kernel_info_t info = {0, 0, 0, 0, 0, 0, 0, 0, 0, {}, {}, {}}; + + uint64_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count; + msgpack_errors += map_lookup_uint64_t(element, ".sgpr_count", &sgpr_count); + msgpackErrorCheck(sgpr count metadata lookup in kernel metadata, + msgpack_errors); + info.sgpr_count = sgpr_count; + + msgpack_errors += map_lookup_uint64_t(element, ".vgpr_count", &vgpr_count); + msgpackErrorCheck(vgpr count metadata lookup in kernel metadata, + msgpack_errors); + info.vgpr_count = vgpr_count; + + msgpack_errors += + map_lookup_uint64_t(element, ".sgpr_spill_count", &sgpr_spill_count); + msgpackErrorCheck(sgpr spill count metadata lookup in kernel metadata, + msgpack_errors); + info.sgpr_spill_count = sgpr_spill_count; + + msgpack_errors += + map_lookup_uint64_t(element, ".vgpr_spill_count", &vgpr_spill_count); + msgpackErrorCheck(vgpr spill count metadata lookup in kernel metadata, + msgpack_errors); + info.vgpr_spill_count = vgpr_spill_count; + size_t kernel_explicit_args_size = 0; uint64_t kernel_segment_size; msgpack_errors += map_lookup_uint64_t(element, ".kernarg_segment_size", diff --git a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp index 0e8df9e9ca60..a6b426dc0557 100644 --- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp @@ -1759,6 +1759,19 @@ int32_t __tgt_rtl_run_target_team_region_locked( KernelTy *KernelInfo = (KernelTy *)tgt_entry_ptr; + std::string kernel_name = std::string(KernelInfo->Name); + uint32_t sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count; + + { + assert(KernelInfoTable[device_id].find(kernel_name) != + KernelInfoTable[device_id].end()); + auto it = KernelInfoTable[device_id][kernel_name]; + sgpr_count = it.sgpr_count; + vgpr_count = it.vgpr_count; + sgpr_spill_count = it.sgpr_spill_count; + vgpr_spill_count = it.vgpr_spill_count; + } + /* * Set limit based on ThreadsPerGroup and GroupsPerDevice */ @@ -1780,10 +1793,12 @@ int32_t __tgt_rtl_run_target_team_region_locked( bool traceToStdout = 
print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); fprintf(traceToStdout ? stdout : stderr, "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " - "reqd:(%4dX%4d) n:%s\n", + "reqd:(%4dX%4d) sgpr_count:%u vgpr_count:%u sgpr_spill_count:%u " + "vgpr_spill_count:%u tripcount:%lu n:%s\n", device_id, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, arg_num, num_groups, threadsPerGroup, num_teams, thread_limit, - KernelInfo->Name); + sgpr_count, vgpr_count, sgpr_spill_count, vgpr_spill_count, + loop_tripcount, KernelInfo->Name); } // Run on the device. @@ -1812,7 +1827,6 @@ int32_t __tgt_rtl_run_target_team_region_locked( packet->reserved2 = 0; // atmi writes id_ here packet->completion_signal = {0}; // may want a pool of signals - std::string kernel_name = std::string(KernelInfo->Name); { assert(KernelInfoTable[device_id].find(kernel_name) != KernelInfoTable[device_id].end()); -- GitLab From 580416d573b6e5d33c09467084e382ac78f2a199 Mon Sep 17 00:00:00 2001 From: Christopher Di Bella Date: Wed, 17 Mar 2021 18:11:31 +0000 Subject: [PATCH 0059/1000] [libcxx] updates the feature-test macro generator D97015 didn't correctly update `generate_feature_test_macro_components.py`. Reviewed By: ldionne, Quuxplusone, #libc, Mordante Differential Revision: https://reviews.llvm.org/D97904 --- libcxx/include/version | 2 +- .../support.limits.general/numbers.version.pass.cpp | 8 ++++---- .../support.limits.general/version.version.pass.cpp | 8 ++++---- libcxx/utils/generate_feature_test_macro_components.py | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/libcxx/include/version b/libcxx/include/version index becbfa5c2cdb..469f1cea82b4 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -337,7 +337,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_latch 201907L # endif # define __cpp_lib_list_remove_return_type 201806L -# ifndef _LIBCPP_HAS_NO_CONCEPTS +# if !defined(_LIBCPP_HAS_NO_CONCEPTS) # define __cpp_lib_math_constants 201907L # endif // # define __cpp_lib_polymorphic_allocator 201902L diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.pass.cpp index aaa64d1f7feb..00752ded5cfb 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/numbers.version.pass.cpp @@ -42,7 +42,7 @@ #elif TEST_STD_VER == 20 -# ifndef _LIBCPP_HAS_NO_CONCEPTS +# if defined(__cpp_concepts) && __cpp_concepts >= 201907L # ifndef __cpp_lib_math_constants # error "__cpp_lib_math_constants should be defined in c++20" # endif @@ -53,11 +53,11 @@ # ifdef __cpp_lib_math_constants # error "__cpp_lib_math_constants should not be defined when defined(__cpp_concepts) && __cpp_concepts >= 201907L is not defined!" # endif -# endif // _LIBCPP_HAS_NO_CONCEPTS +# endif #elif TEST_STD_VER > 20 -# ifndef _LIBCPP_HAS_NO_CONCEPTS +# if defined(__cpp_concepts) && __cpp_concepts >= 201907L # ifndef __cpp_lib_math_constants # error "__cpp_lib_math_constants should be defined in c++2b" # endif @@ -68,7 +68,7 @@ # ifdef __cpp_lib_math_constants # error "__cpp_lib_math_constants should not be defined when defined(__cpp_concepts) && __cpp_concepts >= 201907L is not defined!" 
#   endif
-# endif // _LIBCPP_HAS_NO_CONCEPTS
+# endif

#endif // TEST_STD_VER > 20

diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
index 3a668768d5e5..023f8c1b2317 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.pass.cpp
@@ -2890,7 +2890,7 @@
 #   error "__cpp_lib_map_try_emplace should have the value 201411L in c++20"
 # endif

-# ifndef _LIBCPP_HAS_NO_CONCEPTS
+# if defined(__cpp_concepts) && __cpp_concepts >= 201907L
 #   ifndef __cpp_lib_math_constants
 #     error "__cpp_lib_math_constants should be defined in c++20"
 #   endif
@@ -2901,7 +2901,7 @@
 #   ifdef __cpp_lib_math_constants
 #     error "__cpp_lib_math_constants should not be defined when defined(__cpp_concepts) && __cpp_concepts >= 201907L is not defined!"
 #   endif
-# endif // _LIBCPP_HAS_NO_CONCEPTS
+# endif

 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_math_special_functions
@@ -4104,7 +4104,7 @@
 #   error "__cpp_lib_map_try_emplace should have the value 201411L in c++2b"
 # endif

-# if !_LIBCPP_HAS_NO_CONCEPTS
+# if defined(__cpp_concepts) && __cpp_concepts >= 201907L
 #   ifndef __cpp_lib_math_constants
 #     error "__cpp_lib_math_constants should be defined in c++2b"
 #   endif
@@ -4115,7 +4115,7 @@
 #   ifdef __cpp_lib_math_constants
 #     error "__cpp_lib_math_constants should not be defined when defined(__cpp_concepts) && __cpp_concepts >= 201907L is not defined!"
 #   endif
-# endif // !_LIBCPP_HAS_NO_CONCEPTS
+# endif

 # if !defined(_LIBCPP_VERSION)
 #   ifndef __cpp_lib_math_special_functions
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 7351da3b2a4d..ce0007610b08 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -422,7 +422,7 @@ feature_test_macros = [ add_version_header(x) for x in [
     "values": { "c++20": 201907 },
     "headers": ["numbers"],
     "depends": "defined(__cpp_concepts) && __cpp_concepts >= 201907L",
-    "internal_depends": "defined(__cpp_concepts) && __cpp_concepts >= 201907L",
+    "internal_depends": "!defined(_LIBCPP_HAS_NO_CONCEPTS)",
   }, {
     "name": "__cpp_lib_math_special_functions",
     "values": { "c++17": 201603 },
-- 
GitLab


From 16c30c3c23ef02c0227256bb6f2005a574517de9 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Thu, 18 Mar 2021 10:18:19 -0700
Subject: [PATCH 0060/1000] [ELF] Change --shuffle-sections=<seed> to
 --shuffle-sections=<section-glob>=<seed>

`--shuffle-sections=<seed>` applies to all sections. The new
`--shuffle-sections=<section-glob>=<seed>` makes shuffling selective. To the
best of my knowledge, the option is only used for debugging, so just drop the
original form.

`--shuffle-sections '.init_array*=-1'` `--shuffle-sections '.fini_array*=-1'`
reverses static constructors/destructors of the same priority. Useful to
detect some static initialization order fiasco.

`--shuffle-sections '.data*=-1'` reverses `.data*` sections. Useful to detect
unspecified pointer comparison results of two unrelated objects.

If certain sections have an intrinsic order, the old form cannot be used.
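For example, with the new form (illustrative invocations):

  # shuffle all sections with a fixed seed, equivalent to the old --shuffle-sections=1
  ld.lld --shuffle-sections '*=1' a.o -o a.out
  # reverse only static initializers of the same priority
  ld.lld --shuffle-sections '.init_array*=-1' a.o -o a.out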
Differential Revision: https://reviews.llvm.org/D98679
---
 lld/ELF/Config.h                          |  2 +-
 lld/ELF/Driver.cpp                        | 20 +++++++++-
 lld/ELF/Options.td                        |  7 ++--
 lld/ELF/Writer.cpp                        | 48 ++++++++++++++---------
 lld/docs/ReleaseNotes.rst                 |  3 +-
 lld/docs/ld.lld.1                         |  2 +-
 lld/test/ELF/gnu-ifunc-plt.s              |  4 +-
 lld/test/ELF/shuffle-sections-init-fini.s | 10 ++---
 lld/test/ELF/shuffle-sections.s           | 34 +++++++++++++---
 9 files changed, 90 insertions(+), 40 deletions(-)

diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index fcfe5f64c32f..ab55c60bb6f9 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -198,7 +198,7 @@ struct Configuration {
   bool relocatable;
   bool relrPackDynRelocs;
   bool saveTemps;
-  llvm::Optional<uint32_t> shuffleSectionSeed;
+  std::vector<std::pair<llvm::GlobPattern, uint32_t>> shuffleSections;
   bool singleRoRx;
   bool shared;
   bool symbolic;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index df9925d74f8a..3401c016dbe9 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1068,8 +1068,6 @@ static void readConfigs(opt::InputArgList &args) {
   config->rpath = getRpath(args);
   config->relocatable = args.hasArg(OPT_relocatable);
   config->saveTemps = args.hasArg(OPT_save_temps);
-  if (args.hasArg(OPT_shuffle_sections))
-    config->shuffleSectionSeed = args::getInteger(args, OPT_shuffle_sections, 0);
   config->searchPaths = args::getStrings(args, OPT_library_path);
   config->sectionStartMap = getSectionStartMap(args);
   config->shared = args.hasArg(OPT_shared);
@@ -1149,6 +1147,24 @@ static void readConfigs(opt::InputArgList &args) {
     config->optEL = true;
   }

+  for (opt::Arg *arg : args.filtered(OPT_shuffle_sections)) {
+    constexpr StringRef errPrefix = "--shuffle-sections=: ";
+    std::pair<StringRef, StringRef> kv = StringRef(arg->getValue()).split('=');
+    if (kv.first.empty() || kv.second.empty()) {
+      error(errPrefix + "expected <section_glob>=<seed>, but got '" +
+            arg->getValue() + "'");
+      continue;
+    }
+    // Signed so that =-1 is allowed.
+    int64_t v;
+    if (!to_integer(kv.second, v))
+      error(errPrefix + "expected an integer, but got '" + kv.second + "'");
+    else if (Expected<GlobPattern> pat = GlobPattern::create(kv.first))
+      config->shuffleSections.emplace_back(std::move(*pat), uint32_t(v));
+    else
+      error(errPrefix + toString(pat.takeError()));
+  }
+
   for (opt::Arg *arg : args.filtered(OPT_z)) {
     std::pair<StringRef, StringRef> option =
         StringRef(arg->getValue()).split('=');
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index ee4a0610d362..55bde53cddcb 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -586,9 +586,10 @@ def lto_basic_block_sections: JJ<"lto-basic-block-sections=">,
 defm lto_unique_basic_block_section_names: BB<"lto-unique-basic-block-section-names",
     "Give unique names to every basic block section for LTO",
     "Do not give unique names to every basic block section for LTO (default)">;
-def shuffle_sections: JJ<"shuffle-sections=">, MetaVarName<"<seed>">,
-  HelpText<"Shuffle input sections using the given seed. "
-           "If -1, reverse the section order. If 0, use a random seed">;
+defm shuffle_sections: EEq<"shuffle-sections",
+  "Shuffle matched sections using the given seed before mapping them to the output sections. "
+  "If -1, reverse the section order. If 0, use a random seed">,
+  MetaVarName<"<section-glob>=<seed>">;
 def thinlto_cache_dir: JJ<"thinlto-cache-dir=">,
   HelpText<"Path to ThinLTO cached object file directory">;
 defm thinlto_cache_policy: EEq<"thinlto-cache-policy", "Pruning policy for the ThinLTO cache">;
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
index f0d4e6e4e685..5f5f7ccb4d35 100644
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -1291,29 +1291,39 @@ findOrphanPos(std::vector<BaseCommand *>::iterator b,

 // Adds random priorities to sections not already in the map.
 static void maybeShuffle(DenseMap<const InputSectionBase *, int> &order) {
-  if (!config->shuffleSectionSeed)
+  if (config->shuffleSections.empty())
     return;

-  std::vector<int> priorities(inputSections.size() - order.size());
+  std::vector<InputSectionBase *> matched, sections = inputSections;
+  matched.reserve(sections.size());
+  for (const auto &patAndSeed : config->shuffleSections) {
+    matched.clear();
+    for (InputSectionBase *sec : sections)
+      if (patAndSeed.first.match(sec->name))
+        matched.push_back(sec);
+    const uint32_t seed = patAndSeed.second;
+    if (seed == UINT32_MAX) {
+      // If --shuffle-sections <section-glob>=-1, reverse the section order. The
+      // section order is stable even if the number of sections changes. This is
+      // useful to catch issues like static initialization order fiasco
+      // reliably.
+      std::reverse(matched.begin(), matched.end());
+    } else {
+      std::mt19937 g(seed ? seed : std::random_device()());
+      llvm::shuffle(matched.begin(), matched.end(), g);
+    }
+    size_t i = 0;
+    for (InputSectionBase *&sec : sections)
+      if (patAndSeed.first.match(sec->name))
+        sec = matched[i++];
+  }
+
   // Existing priorities are < 0, so use priorities >= 0 for the missing
   // sections.
-  int curPrio = 0;
-  for (int &prio : priorities)
-    prio = curPrio++;
-  uint32_t seed = *config->shuffleSectionSeed;
-  if (seed == UINT32_MAX) {
-    // If --shuffle-sections=-1, reverse the section order. The section order is
-    // stable even if the number of sections changes. This is useful to catch
-    // issues like static initialization order fiasco reliably.
-    std::reverse(priorities.begin(), priorities.end());
-  } else {
-    std::mt19937 g(seed ? seed : std::random_device()());
-    llvm::shuffle(priorities.begin(), priorities.end(), g);
-  }
-  int prioIndex = 0;
-  for (InputSectionBase *sec : inputSections) {
-    if (order.try_emplace(sec, priorities[prioIndex]).second)
-      ++prioIndex;
+  int prio = 0;
+  for (InputSectionBase *sec : sections) {
+    if (order.try_emplace(sec, prio).second)
+      ++prio;
   }
 }
diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst
index 3684e99cb80c..a3b577e48fb1 100644
--- a/lld/docs/ReleaseNotes.rst
+++ b/lld/docs/ReleaseNotes.rst
@@ -29,7 +29,8 @@ ELF Improvements

 Breaking changes
 ----------------
-* ...
+* ``--shuffle-sections=<seed>`` has been changed to ``--shuffle-sections=<section-glob>=<seed>``.
+  Specify ``*`` as ``<section-glob>`` to get the previous behavior.

 COFF Improvements
 -----------------
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index 3c1704c0c5e8..37c42a0eb51f 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -487,7 +487,7 @@ Set address of section.
 .It Fl -shared , Fl -Bsharable
 Build a shared object.
 .It Fl -shuffle-sections Ns = Ns Ar seed
-Shuffle input sections using the given seed.
+Shuffle matched sections using the given seed before mapping them to the output sections.
 If -1, reverse the section order. If 0, use a random seed.
.It Fl -soname Ns = Ns Ar value , Fl h Ar value
Set
diff --git a/lld/test/ELF/gnu-ifunc-plt.s b/lld/test/ELF/gnu-ifunc-plt.s
index 540bfbc5325c..58fae803a0e5 100644
--- a/lld/test/ELF/gnu-ifunc-plt.s
+++ b/lld/test/ELF/gnu-ifunc-plt.s
@@ -80,9 +80,9 @@
 // Test that --shuffle-sections does not affect the order of relocations and that
 // we still place IRELATIVE relocations last. Check both random seed (0) and an
 // arbitrary seed that was known to break the order of relocations previously (3).
-// RUN: ld.lld --shuffle-sections=3 %t.so %t.o -o %tout2
+// RUN: ld.lld --shuffle-sections='*=3' %t.so %t.o -o %tout2
 // RUN: llvm-readobj --relocations %tout2 | FileCheck %s --check-prefix=SHUFFLE
-// RUN: ld.lld --shuffle-sections=0 %t.so %t.o -o %tout3
+// RUN: ld.lld --shuffle-sections='*=0' %t.so %t.o -o %tout3
 // RUN: llvm-readobj --relocations %tout3 | FileCheck %s --check-prefix=SHUFFLE

 // SHUFFLE: Section {{.*}} .rela.dyn {
diff --git a/lld/test/ELF/shuffle-sections-init-fini.s b/lld/test/ELF/shuffle-sections-init-fini.s
index d98ca8d359de..4ddbf6cb7483 100644
--- a/lld/test/ELF/shuffle-sections-init-fini.s
+++ b/lld/test/ELF/shuffle-sections-init-fini.s
@@ -5,7 +5,7 @@
 # RUN: llvm-readelf -x .init -x .fini -x .init_array -x .fini_array %t | \
 # RUN:   FileCheck --check-prefixes=CHECK,ORDERED %s

-# RUN: ld.lld %t.o --shuffle-sections=1 -o %t1
+# RUN: ld.lld %t.o --shuffle-sections '*=1' -o %t1
 # RUN: llvm-readelf -x .init -x .fini -x .init_array -x .fini_array %t1 | \
 # RUN:   FileCheck --check-prefixes=CHECK,SHUFFLED %s

@@ -21,12 +21,12 @@
 # CHECK: Hex dump of section '.init_array'
 # CHECK-NEXT: 0x{{[0-9a-f]+}} ff
 # ORDERED-SAME: 000102 03040506 0708090a 0b
-# SHUFFLED-SAME: 04000b 06010a08 09070203 05
+# SHUFFLED-SAME: 080301 04050907 0b020a06 00

 # CHECK: Hex dump of section '.fini_array'
 # CHECK-NEXT: 0x{{[0-9a-f]+}} ff
 # ORDERED-SAME: 000102 03040506 0708090a 0b
-# SHUFFLED-SAME: 090401 070b0003 080a0605 02
+# SHUFFLED-SAME: 0a0405 08070b02 03090006 01

## With a SECTIONS command, SHT_INIT_ARRAY priorities are ignored.
## All .init_array* are shuffled together.
# RUN:
# RUN: ld.lld -T %t.script %t.o -o %t2
# RUN: llvm-readelf -x .init -x .fini -x .init_array -x .fini_array %t2 | \
# RUN:   FileCheck --check-prefixes=CHECK2,ORDERED2 %s
-# RUN: ld.lld -T %t.script %t.o --shuffle-sections=1 -o %t3
+# RUN: ld.lld -T %t.script %t.o --shuffle-sections '*=1' -o %t3
# RUN: llvm-readelf -x .init -x .fini -x .init_array -x .fini_array %t3 | \
# RUN:   FileCheck --check-prefixes=CHECK2,SHUFFLED2 %s

# CHECK2: Hex dump of section '.init_array'
# ORDERED2-NEXT: 0x{{[0-9a-f]+}} 00010203 04050607 08090a0b ff
-# SHUFFLED2-NEXT: 0x{{[0-9a-f]+}} 04000b06 010a0809 07ff0203 05
+# SHUFFLED2-NEXT: 0x{{[0-9a-f]+}} 08030104 0509070b 02ff0a06 00

 .irp i,0,1,2,3,4,5,6,7,8,9,10,11
   .section .init,"ax",@progbits,unique,\i
diff --git a/lld/test/ELF/shuffle-sections.s b/lld/test/ELF/shuffle-sections.s
index 59b0642d639c..8211c482732b 100644
--- a/lld/test/ELF/shuffle-sections.s
+++ b/lld/test/ELF/shuffle-sections.s
@@ -7,31 +7,53 @@
 # CHECK-NEXT: 01020304

 ## --shuffle-sections= shuffles input sections.
-# RUN: ld.lld --shuffle-sections=1 %t.o -o %t1.out
+# RUN: ld.lld --shuffle-sections='*=1' %t.o -o %t1.out
 # RUN: llvm-readelf -x .text %t1.out | FileCheck %s --check-prefix=SHUFFLE1
 # SHUFFLE1: Hex dump of section '.text':
-# SHUFFLE1-NEXT: 0204cccc 0103
+# SHUFFLE1-NEXT: 0203cccc 0104

 ## Test that --shuffle-sections= can be used with --symbol-ordering-file
 # RUN: echo "foo" > %t_order.txt
 # RUN: echo "_start " >> %t_order.txt
-# RUN: ld.lld --symbol-ordering-file %t_order.txt --shuffle-sections=2 %t.o -o %t2.out
+# RUN: ld.lld --symbol-ordering-file %t_order.txt --shuffle-sections='*=2' %t.o -o %t2.out
 # RUN: llvm-readelf -x .text %t2.out | FileCheck %s --check-prefix=SHUFFLE2
 # SHUFFLE2: Hex dump of section '.text':
-# SHUFFLE2-NEXT: 02cccccc 010304
+# SHUFFLE2-NEXT: 02cccccc 010403

-# RUN: ld.lld --symbol-ordering-file %t_order.txt --shuffle-sections=3 %t.o -o %t3.out
+# RUN: ld.lld --symbol-ordering-file %t_order.txt --shuffle-sections='*=3' %t.o -o %t3.out
 # RUN: llvm-readelf -x .text %t3.out | FileCheck %s --check-prefix=SHUFFLE3
 # SHUFFLE3: Hex dump of section '.text':
 # SHUFFLE3-NEXT: 02cccccc 010403

 ## As a special case, -1 reverses sections as a stable transform.
-# RUN: ld.lld --shuffle-sections=-1 %t.o -o %t-1.out
+# RUN: ld.lld --shuffle-sections '*=-1' %t.o -o %t-1.out
 # RUN: llvm-readelf -x .text %t-1.out | FileCheck %s --check-prefix=SHUFFLE-1
 # SHUFFLE-1: Hex dump of section '.text':
 # SHUFFLE-1-NEXT: 040302cc 01

+## .text does not change its order while .text.{foo,bar,zed} are reversed.
+# RUN: ld.lld --shuffle-sections '.text.*=-1' %t.o -o %t4.out
+# RUN: llvm-readelf -x .text %t4.out | FileCheck %s --check-prefix=SHUFFLE4
+# SHUFFLE4: Hex dump of section '.text':
+# SHUFFLE4-NEXT: 01040302
+
+## Reversing twice restores the original order.
+# RUN: ld.lld --shuffle-sections '.text.*=-1' --shuffle-sections '.text.*=-1' %t.o -o %t.out
+# RUN: llvm-readelf -x .text %t.out | FileCheck %s
+
+## Test all possible invalid cases.
+# RUN: not ld.lld --shuffle-sections= 2>&1 | FileCheck %s --check-prefix=USAGE -DV=
+# RUN: not ld.lld --shuffle-sections=a= 2>&1 | FileCheck %s --check-prefix=USAGE -DV=a=
+# RUN: not ld.lld --shuffle-sections==0 2>&1 | FileCheck %s --check-prefix=USAGE -DV==0
+# RUN: not ld.lld --shuffle-sections=a 2>&1 | FileCheck %s --check-prefix=USAGE -DV=a
+
+# USAGE: error: --shuffle-sections=: expected <section_glob>=<seed>, but got '[[V]]'
+
+# RUN: not ld.lld --shuffle-sections='['=0 2>&1 | FileCheck %s --check-prefix=INVALID
+
+# INVALID: error: --shuffle-sections=: invalid glob pattern: [
+
 ## .text has an alignment of 4.
 .global _start
 _start:
-- 
GitLab


From 0d6482a76adda7a79db343b020e5f62196999ae6 Mon Sep 17 00:00:00 2001
From: Peter Waller
Date: Thu, 11 Mar 2021 17:29:32 +0000
Subject: [PATCH 0061/1000] [llvm][AArch64][SVE] Lower fixed length vector
 fabs

Seemingly straightforward.
Differential Revision: https://reviews.llvm.org/D98434 --- .../Target/AArch64/AArch64ISelLowering.cpp | 1 + .../AArch64/sve-fixed-length-fp-arith.ll | 233 ++++++++++++++++++ 2 files changed, 234 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e3c928e1b79b..757d838ad3fe 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1397,6 +1397,7 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::FABS, VT, Custom); setOperationAction(ISD::FADD, VT, Custom); setOperationAction(ISD::FCEIL, VT, Custom); setOperationAction(ISD::FDIV, VT, Custom); diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll index fdd0acd97024..667513b77e43 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-fp-arith.ll @@ -1710,6 +1710,220 @@ define void @fsub_v32f64(<32 x double>* %a, <32 x double>* %b) #0 { ret void } +; +; FABS +; + +; Don't use SVE for 64-bit vectors. +define <4 x half> @fabs_v4f16(<4 x half> %op) #0 { +; CHECK-LABEL: fabs_v4f16: +; CHECK: fabs v0.4h, v0.4h +; CHECK: ret + %res = call <4 x half> @llvm.fabs.v4f16(<4 x half> %op) + ret <4 x half> %res +} + +; Don't use SVE for 128-bit vectors. +define <8 x half> @fabs_v8f16(<8 x half> %op) #0 { +; CHECK-LABEL: fabs_v8f16: +; CHECK: fabs v0.8h, v0.8h +; CHECK: ret + %res = call <8 x half> @llvm.fabs.v8f16(<8 x half> %op) + ret <8 x half> %res +} + +define void @fabs_v16f16(<16 x half>* %a) #0 { +; CHECK-LABEL: fabs_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),16)]] +; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h +; CHECK: st1h { [[RES]].h }, [[PG]], [x0] +; CHECK: ret + %op = load <16 x half>, <16 x half>* %a + %res = call <16 x half> @llvm.fabs.v16f16(<16 x half> %op) + store <16 x half> %res, <16 x half>* %a + ret void +} + +define void @fabs_v32f16(<32 x half>* %a) #0 { +; CHECK-LABEL: fabs_v32f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),32)]] +; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h +; CHECK: st1h { [[RES]].h }, [[PG]], [x0] +; CHECK: ret + %op = load <32 x half>, <32 x half>* %a + %res = call <32 x half> @llvm.fabs.v32f16(<32 x half> %op) + store <32 x half> %res, <32 x half>* %a + ret void +} + +define void @fabs_v64f16(<64 x half>* %a) #0 { +; CHECK-LABEL: fabs_v64f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),64)]] +; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h +; CHECK: st1h { [[RES]].h }, [[PG]], [x0] +; CHECK: ret + %op = load <64 x half>, <64 x half>* %a + %res = call <64 x half> @llvm.fabs.v64f16(<64 x half> %op) + store <64 x half> %res, <64 x half>* %a + ret void +} + +define void @fabs_v128f16(<128 x half>* %a) #0 { +; CHECK-LABEL: fabs_v128f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl[[#min(div(VBYTES,2),128)]] +; CHECK: ld1h { [[OP:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].h, [[PG]]/m, [[OP]].h +; CHECK: st1h { [[RES]].h }, [[PG]], [x0] +; CHECK: ret + %op = load <128 x half>, <128 x half>* %a + %res = call <128 x half> @llvm.fabs.v128f16(<128 x half> 
%op) + store <128 x half> %res, <128 x half>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <2 x float> @fabs_v2f32(<2 x float> %op) #0 { +; CHECK-LABEL: fabs_v2f32: +; CHECK: fabs v0.2s, v0.2s +; CHECK: ret + %res = call <2 x float> @llvm.fabs.v2f32(<2 x float> %op) + ret <2 x float> %res +} + +; Don't use SVE for 128-bit vectors. +define <4 x float> @fabs_v4f32(<4 x float> %op) #0 { +; CHECK-LABEL: fabs_v4f32: +; CHECK: fabs v0.4s, v0.4s +; CHECK: ret + %res = call <4 x float> @llvm.fabs.v4f32(<4 x float> %op) + ret <4 x float> %res +} + +define void @fabs_v8f32(<8 x float>* %a) #0 { +; CHECK-LABEL: fabs_v8f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),8)]] +; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s +; CHECK: st1w { [[RES]].s }, [[PG]], [x0] +; CHECK: ret + %op = load <8 x float>, <8 x float>* %a + %res = call <8 x float> @llvm.fabs.v8f32(<8 x float> %op) + store <8 x float> %res, <8 x float>* %a + ret void +} + +define void @fabs_v16f32(<16 x float>* %a) #0 { +; CHECK-LABEL: fabs_v16f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),16)]] +; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s +; CHECK: st1w { [[RES]].s }, [[PG]], [x0] +; CHECK: ret + %op = load <16 x float>, <16 x float>* %a + %res = call <16 x float> @llvm.fabs.v16f32(<16 x float> %op) + store <16 x float> %res, <16 x float>* %a + ret void +} + +define void @fabs_v32f32(<32 x float>* %a) #0 { +; CHECK-LABEL: fabs_v32f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),32)]] +; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s +; CHECK: st1w { [[RES]].s }, [[PG]], [x0] +; CHECK: ret + %op = load <32 x float>, <32 x float>* %a + %res = call <32 x float> @llvm.fabs.v32f32(<32 x float> %op) + store <32 x float> %res, <32 x float>* %a + ret void +} + +define void @fabs_v64f32(<64 x float>* %a) #0 { +; CHECK-LABEL: fabs_v64f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl[[#min(div(VBYTES,4),64)]] +; CHECK: ld1w { [[OP:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].s, [[PG]]/m, [[OP]].s +; CHECK: st1w { [[RES]].s }, [[PG]], [x0] +; CHECK: ret + %op = load <64 x float>, <64 x float>* %a + %res = call <64 x float> @llvm.fabs.v64f32(<64 x float> %op) + store <64 x float> %res, <64 x float>* %a + ret void +} + +; Don't use SVE for 64-bit vectors. +define <1 x double> @fabs_v1f64(<1 x double> %op) #0 { +; CHECK-LABEL: fabs_v1f64: +; CHECK: fabs d0, d0 +; CHECK: ret + %res = call <1 x double> @llvm.fabs.v1f64(<1 x double> %op) + ret <1 x double> %res +} + +; Don't use SVE for 128-bit vectors. 
+define <2 x double> @fabs_v2f64(<2 x double> %op) #0 { +; CHECK-LABEL: fabs_v2f64: +; CHECK: fabs v0.2d, v0.2d +; CHECK: ret + %res = call <2 x double> @llvm.fabs.v2f64(<2 x double> %op) + ret <2 x double> %res +} + +define void @fabs_v4f64(<4 x double>* %a) #0 { +; CHECK-LABEL: fabs_v4f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),4)]] +; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d +; CHECK: st1d { [[RES]].d }, [[PG]], [x0] +; CHECK: ret + %op = load <4 x double>, <4 x double>* %a + %res = call <4 x double> @llvm.fabs.v4f64(<4 x double> %op) + store <4 x double> %res, <4 x double>* %a + ret void +} + +define void @fabs_v8f64(<8 x double>* %a) #0 { +; CHECK-LABEL: fabs_v8f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),8)]] +; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d +; CHECK: st1d { [[RES]].d }, [[PG]], [x0] +; CHECK: ret + %op = load <8 x double>, <8 x double>* %a + %res = call <8 x double> @llvm.fabs.v8f64(<8 x double> %op) + store <8 x double> %res, <8 x double>* %a + ret void +} + +define void @fabs_v16f64(<16 x double>* %a) #0 { +; CHECK-LABEL: fabs_v16f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),16)]] +; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d +; CHECK: st1d { [[RES]].d }, [[PG]], [x0] +; CHECK: ret + %op = load <16 x double>, <16 x double>* %a + %res = call <16 x double> @llvm.fabs.v16f64(<16 x double> %op) + store <16 x double> %res, <16 x double>* %a + ret void +} + +define void @fabs_v32f64(<32 x double>* %a) #0 { +; CHECK-LABEL: fabs_v32f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl[[#min(div(VBYTES,8),32)]] +; CHECK: ld1d { [[OP:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK: fabs [[RES:z[0-9]+]].d, [[PG]]/m, [[OP]].d +; CHECK: st1d { [[RES]].d }, [[PG]], [x0] +; CHECK: ret + %op = load <32 x double>, <32 x double>* %a + %res = call <32 x double> @llvm.fabs.v32f64(<32 x double> %op) + store <32 x double> %res, <32 x double>* %a + ret void +} + attributes #0 = { "target-features"="+sve" } declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) @@ -1749,3 +1963,22 @@ declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) declare <8 x double> @llvm.sqrt.v8f64(<8 x double>) declare <16 x double> @llvm.sqrt.v16f64(<16 x double>) declare <32 x double> @llvm.sqrt.v32f64(<32 x double>) + +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <16 x half> @llvm.fabs.v16f16(<16 x half>) +declare <32 x half> @llvm.fabs.v32f16(<32 x half>) +declare <64 x half> @llvm.fabs.v64f16(<64 x half>) +declare <128 x half> @llvm.fabs.v128f16(<128 x half>) +declare <2 x float> @llvm.fabs.v2f32(<2 x float>) +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare <16 x float> @llvm.fabs.v16f32(<16 x float>) +declare <32 x float> @llvm.fabs.v32f32(<32 x float>) +declare <64 x float> @llvm.fabs.v64f32(<64 x float>) +declare <1 x double> @llvm.fabs.v1f64(<1 x double>) +declare <2 x double> @llvm.fabs.v2f64(<2 x double>) +declare <4 x double> @llvm.fabs.v4f64(<4 x double>) +declare <8 x double> @llvm.fabs.v8f64(<8 x double>) +declare <16 x double> @llvm.fabs.v16f64(<16 x double>) +declare <32 x double> @llvm.fabs.v32f64(<32 x double>) -- GitLab From 8638c897f469dbd1d95b2e46b39ab72fb7b9d336 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Thu, 18 Mar 2021 10:23:12 -0700 Subject: 
[PATCH 0062/1000] [WebAssembly] Remove unimplemented-simd target feature Now that the WebAssembly SIMD specification is finalized and engines are generally up-to-date, there is no need for a separate target feature for gating SIMD instructions that engines have not implemented. With this change, v128.const is now enabled by default with the simd128 target feature. Differential Revision: https://reviews.llvm.org/D98457 --- clang/docs/ClangCommandLineReference.rst | 1 - .../clang/Basic/BuiltinsWebAssembly.def | 12 +- clang/include/clang/Driver/Options.td | 2 - clang/lib/Basic/Targets/WebAssembly.cpp | 19 - clang/lib/Basic/Targets/WebAssembly.h | 1 - clang/test/CodeGen/builtins-wasm.c | 4 +- .../test/Preprocessor/wasm-target-features.c | 10 - llvm/lib/Target/WebAssembly/WebAssembly.td | 6 - .../WebAssembly/WebAssemblyFastISel.cpp | 5 +- .../WebAssembly/WebAssemblyISelLowering.cpp | 57 +- .../WebAssembly/WebAssemblyInstrInfo.td | 4 - .../WebAssembly/WebAssemblyInstrSIMD.td | 3 +- .../Target/WebAssembly/WebAssemblySubtarget.h | 4 - llvm/test/CodeGen/WebAssembly/simd-arith.ll | 13 +- .../CodeGen/WebAssembly/simd-build-vector.ll | 134 +---- .../CodeGen/WebAssembly/simd-comparisons.ll | 3 +- .../CodeGen/WebAssembly/simd-conversions.ll | 11 +- .../CodeGen/WebAssembly/simd-intrinsics.ll | 550 +++++++++--------- .../CodeGen/WebAssembly/simd-load-splat.ll | 2 +- .../WebAssembly/simd-load-store-alignment.ll | 2 +- llvm/test/CodeGen/WebAssembly/simd-noopt.ll | 20 - .../CodeGen/WebAssembly/simd-reductions.ll | 146 ++--- llvm/test/CodeGen/WebAssembly/simd-select.ll | 2 +- .../CodeGen/WebAssembly/simd-sext-inreg.ll | 2 +- .../CodeGen/WebAssembly/simd-unsupported.ll | 2 +- llvm/test/CodeGen/WebAssembly/simd.ll | 9 +- llvm/test/MC/WebAssembly/basic-assembly.s | 4 +- llvm/test/MC/WebAssembly/data-section.s | 8 +- llvm/test/MC/WebAssembly/simd-encodings.s | 2 +- llvm/test/MC/WebAssembly/type-index.s | 4 +- llvm/test/MC/WebAssembly/types.ll | 2 +- llvm/test/MC/WebAssembly/wasm64.s | 4 +- 32 files changed, 413 insertions(+), 635 deletions(-) delete mode 100644 llvm/test/CodeGen/WebAssembly/simd-noopt.ll diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst index 0038dccd53f9..bca5722f80d0 100644 --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -3798,4 +3798,3 @@ undef all system defines .. 
option:: -z

 Pass -z <arg> to the linker
-
diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def
index 84f346bcb928..38de66587cba 100644
--- a/clang/include/clang/Basic/BuiltinsWebAssembly.def
+++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def
@@ -151,11 +151,11 @@ TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16ScV16ScV16ScIiIiIiIiIiIiIiIiIiI
 TARGET_BUILTIN(__builtin_wasm_any_true_i8x16, "iV16Sc", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_any_true_i16x8, "iV8s", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_any_true_i32x4, "iV4i", "nc", "simd128")
-TARGET_BUILTIN(__builtin_wasm_any_true_i64x2, "iV2LLi", "nc", "unimplemented-simd128")
+TARGET_BUILTIN(__builtin_wasm_any_true_i64x2, "iV2LLi", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_all_true_i8x16, "iV16Sc", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_all_true_i16x8, "iV8s", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_all_true_i32x4, "iV4i", "nc", "simd128")
-TARGET_BUILTIN(__builtin_wasm_all_true_i64x2, "iV2LLi", "nc", "unimplemented-simd128")
+TARGET_BUILTIN(__builtin_wasm_all_true_i64x2, "iV2LLi", "nc", "simd128")

 TARGET_BUILTIN(__builtin_wasm_bitmask_i8x16, "iV16Sc", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_bitmask_i16x8, "iV8s", "nc", "simd128")
@@ -188,10 +188,10 @@ TARGET_BUILTIN(__builtin_wasm_dot_s_i32x4_i16x8, "V4iV8sV8s", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_sqrt_f32x4, "V4fV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_sqrt_f64x2, "V2dV2d", "nc", "simd128")

-TARGET_BUILTIN(__builtin_wasm_qfma_f32x4, "V4fV4fV4fV4f", "nc", "unimplemented-simd128")
-TARGET_BUILTIN(__builtin_wasm_qfms_f32x4, "V4fV4fV4fV4f", "nc", "unimplemented-simd128")
-TARGET_BUILTIN(__builtin_wasm_qfma_f64x2, "V2dV2dV2dV2d", "nc", "unimplemented-simd128")
-TARGET_BUILTIN(__builtin_wasm_qfms_f64x2, "V2dV2dV2dV2d", "nc", "unimplemented-simd128")
+TARGET_BUILTIN(__builtin_wasm_qfma_f32x4, "V4fV4fV4fV4f", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_qfms_f32x4, "V4fV4fV4fV4f", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_qfma_f64x2, "V2dV2dV2dV2d", "nc", "simd128")
+TARGET_BUILTIN(__builtin_wasm_qfms_f64x2, "V2dV2dV2dV2d", "nc", "simd128")

 TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i32x4_f32x4, "V4iV4f", "nc", "simd128")
 TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i32x4_f32x4, "V4iV4f", "nc", "simd128")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 55dddab6160c..9c5013ee88d9 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3124,8 +3124,6 @@ def mharden_sls_EQ : Joined<["-"], "mharden-sls=">,
   HelpText<"Select straight-line speculation hardening scope">;

 def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
-def munimplemented_simd128 : Flag<["-"], "munimplemented-simd128">, Group<m_wasm_Features_Group>;
-def mno_unimplemented_simd128 : Flag<["-"], "mno-unimplemented-simd128">, Group<m_wasm_Features_Group>;
 def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;
 def mnontrapping_fptoint : Flag<["-"], "mnontrapping-fptoint">, Group<m_wasm_Features_Group>;
 def mno_nontrapping_fptoint : Flag<["-"], "mno-nontrapping-fptoint">, Group<m_wasm_Features_Group>;
diff --git a/clang/lib/Basic/Targets/WebAssembly.cpp b/clang/lib/Basic/Targets/WebAssembly.cpp
index 89babe85794d..2a5055c3d534 100644
--- a/clang/lib/Basic/Targets/WebAssembly.cpp
+++ b/clang/lib/Basic/Targets/WebAssembly.cpp
@@ -46,7 +46,6 @@ bool WebAssemblyTargetInfo::setABI(const std::string &Name) {
 bool WebAssemblyTargetInfo::hasFeature(StringRef Feature) const {
   return llvm::StringSwitch<bool>(Feature)
.Case("simd128", SIMDLevel >= SIMD128) - .Case("unimplemented-simd128", SIMDLevel >= UnimplementedSIMD128) .Case("nontrapping-fptoint", HasNontrappingFPToInt) .Case("sign-ext", HasSignExt) .Case("exception-handling", HasExceptionHandling) @@ -73,8 +72,6 @@ void WebAssemblyTargetInfo::getTargetDefines(const LangOptions &Opts, defineCPUMacros(Builder, "wasm", /*Tuning=*/false); if (SIMDLevel >= SIMD128) Builder.defineMacro("__wasm_simd128__"); - if (SIMDLevel >= UnimplementedSIMD128) - Builder.defineMacro("__wasm_unimplemented_simd128__"); if (HasNontrappingFPToInt) Builder.defineMacro("__wasm_nontrapping_fptoint__"); if (HasSignExt) @@ -99,9 +96,6 @@ void WebAssemblyTargetInfo::setSIMDLevel(llvm::StringMap &Features, SIMDEnum Level, bool Enabled) { if (Enabled) { switch (Level) { - case UnimplementedSIMD128: - Features["unimplemented-simd128"] = true; - LLVM_FALLTHROUGH; case SIMD128: Features["simd128"] = true; LLVM_FALLTHROUGH; @@ -115,9 +109,6 @@ void WebAssemblyTargetInfo::setSIMDLevel(llvm::StringMap &Features, case NoSIMD: case SIMD128: Features["simd128"] = false; - LLVM_FALLTHROUGH; - case UnimplementedSIMD128: - Features["unimplemented-simd128"] = false; break; } } @@ -127,8 +118,6 @@ void WebAssemblyTargetInfo::setFeatureEnabled(llvm::StringMap &Features, bool Enabled) const { if (Name == "simd128") setSIMDLevel(Features, SIMD128, Enabled); - else if (Name == "unimplemented-simd128") - setSIMDLevel(Features, UnimplementedSIMD128, Enabled); else Features[Name] = Enabled; } @@ -160,14 +149,6 @@ bool WebAssemblyTargetInfo::handleTargetFeatures( SIMDLevel = std::min(SIMDLevel, SIMDEnum(SIMD128 - 1)); continue; } - if (Feature == "+unimplemented-simd128") { - SIMDLevel = std::max(SIMDLevel, SIMDEnum(UnimplementedSIMD128)); - continue; - } - if (Feature == "-unimplemented-simd128") { - SIMDLevel = std::min(SIMDLevel, SIMDEnum(UnimplementedSIMD128 - 1)); - continue; - } if (Feature == "+nontrapping-fptoint") { HasNontrappingFPToInt = true; continue; diff --git a/clang/lib/Basic/Targets/WebAssembly.h b/clang/lib/Basic/Targets/WebAssembly.h index 9150d849f601..be5b66a9580b 100644 --- a/clang/lib/Basic/Targets/WebAssembly.h +++ b/clang/lib/Basic/Targets/WebAssembly.h @@ -27,7 +27,6 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyTargetInfo : public TargetInfo { enum SIMDEnum { NoSIMD, SIMD128, - UnimplementedSIMD128, } SIMDLevel = NoSIMD; bool HasNontrappingFPToInt = false; diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 771764c85d6b..124b09633693 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -1,5 +1,5 @@ -// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +unimplemented-simd128 -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY32 -// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +unimplemented-simd128 -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY64 +// RUN: %clang_cc1 -triple wasm32-unknown-unknown -target-feature +simd128 -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s | 
+// RUN: %clang_cc1 -triple wasm64-unknown-unknown -target-feature +simd128 -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s | FileCheck %s -check-prefixes WEBASSEMBLY,WEBASSEMBLY64
 // RUN: not %clang_cc1 -triple wasm64-unknown-unknown -target-feature +nontrapping-fptoint -target-feature +exception-handling -target-feature +bulk-memory -target-feature +atomics -flax-vector-conversions=none -O3 -emit-llvm -o - %s 2>&1 | FileCheck %s -check-prefixes MISSING-SIMD

 // SIMD convenience types
diff --git a/clang/test/Preprocessor/wasm-target-features.c b/clang/test/Preprocessor/wasm-target-features.c
index 05b4bb49d73b..29cc3071a235 100644
--- a/clang/test/Preprocessor/wasm-target-features.c
+++ b/clang/test/Preprocessor/wasm-target-features.c
@@ -7,15 +7,6 @@
 //
 // SIMD128:#define __wasm_simd128__ 1{{$}}

-// RUN: %clang -E -dM %s -o - 2>&1 \
-// RUN:     -target wasm32-unknown-unknown -munimplemented-simd128 \
-// RUN:   | FileCheck %s -check-prefix=SIMD128-UNIMPLEMENTED
-// RUN: %clang -E -dM %s -o - 2>&1 \
-// RUN:     -target wasm64-unknown-unknown -munimplemented-simd128 \
-// RUN:   | FileCheck %s -check-prefix=SIMD128-UNIMPLEMENTED
-//
-// SIMD128-UNIMPLEMENTED:#define __wasm_unimplemented_simd128__ 1{{$}}
-
 // RUN: %clang -E -dM %s -o - 2>&1 \
 // RUN:     -target wasm32-unknown-unknown -mnontrapping-fptoint \
 // RUN:   | FileCheck %s -check-prefix=NONTRAPPING-FPTOINT
@@ -114,7 +105,6 @@
 // RUN:   | FileCheck %s -check-prefix=MVP
 //
 // MVP-NOT:#define __wasm_simd128__
-// MVP-NOT:#define __wasm_unimplemented_simd128__
 // MVP-NOT:#define __wasm_nontrapping_fptoint__
 // MVP-NOT:#define __wasm_sign_ext__
 // MVP-NOT:#define __wasm_exception_handling__
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index 2c18bf2c3abe..c1872dd91c58 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -25,12 +25,6 @@ include "llvm/Target/Target.td"
 def FeatureSIMD128 : SubtargetFeature<"simd128", "SIMDLevel", "SIMD128",
                                       "Enable 128-bit SIMD">;

-def FeatureUnimplementedSIMD128 :
-    SubtargetFeature<"unimplemented-simd128",
-                     "SIMDLevel", "UnimplementedSIMD128",
-                     "Enable 128-bit SIMD not yet implemented in engines",
-                     [FeatureSIMD128]>;
-
 def FeatureAtomics : SubtargetFeature<"atomics", "HasAtomics", "true",
                                       "Enable Atomics">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index 994baf797c7c..5b54ffdc2511 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -139,12 +139,9 @@ private:
     case MVT::v8i16:
     case MVT::v4i32:
     case MVT::v4f32:
-      if (Subtarget->hasSIMD128())
-        return VT;
-      break;
     case MVT::v2i64:
     case MVT::v2f64:
-      if (Subtarget->hasUnimplementedSIMD128())
+      if (Subtarget->hasSIMD128())
         return VT;
       break;
     default:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index a515a5f4aef2..8cf44b545e06 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -1705,18 +1705,14 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
   std::function<bool(size_t, const SDValue &)> IsLaneConstructed;
   SDValue Result;
   // Prefer swizzles over vector consts over splats
-  if (NumSwizzleLanes >= NumSplatLanes &&
-      (!Subtarget->hasUnimplementedSIMD128() ||
-       NumSwizzleLanes >= NumConstantLanes)) {
+  if (NumSwizzleLanes >= NumSplatLanes && NumSwizzleLanes >= NumConstantLanes) {
     Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc,
                          SwizzleIndices);
     auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices);
     IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) {
       return Swizzled == GetSwizzleSrcs(I, Lane);
     };
-  } else if (NumConstantLanes >= NumSplatLanes &&
-             Subtarget->hasUnimplementedSIMD128()) {
-    // If we support v128.const, emit it directly
+  } else if (NumConstantLanes >= NumSplatLanes) {
     SmallVector<SDValue, 16> ConstLanes;
     for (const SDValue &Lane : Op->op_values()) {
       if (IsConstant(Lane)) {
@@ -1731,55 +1727,6 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) {
       return IsConstant(Lane);
     };
-  } else if (NumConstantLanes >= NumSplatLanes && VecT.isInteger()) {
-    // Otherwise, if this is an integer vector, pack the lane values together so
-    // we can construct the 128-bit constant from a pair of i64s using a splat
-    // followed by at most one i64x2.replace_lane. Also keep track of the lanes
-    // that actually matter so we can avoid the replace_lane in more cases.
-    std::array<uint64_t, 2> I64s{{0, 0}};
-    std::array<uint64_t, 2> ConstLaneMasks{{0, 0}};
-    size_t LaneBits = 128 / Lanes;
-    size_t HalfLanes = Lanes / 2;
-    for (size_t I = 0; I < Lanes; ++I) {
-      const SDValue &Lane = Op.getOperand(I);
-      if (IsConstant(Lane)) {
-        // How much we need to shift Val to position it in an i64
-        auto Shift = LaneBits * (I % HalfLanes);
-        auto Mask = maskTrailingOnes<uint64_t>(LaneBits);
-        auto Val = cast<ConstantSDNode>(Lane.getNode())->getZExtValue() & Mask;
-        I64s[I / HalfLanes] |= Val << Shift;
-        ConstLaneMasks[I / HalfLanes] |= Mask << Shift;
-      }
-    }
-    // Check whether all constant lanes in the second half of the vector are
-    // equivalent in the first half or vice versa to determine whether splatting
-    // either side will be sufficient to materialize the constant. As a special
-    // case, if the first and second halves have no constant lanes in common, we
-    // can just combine them.
-    bool FirstHalfSufficient = (I64s[0] & ConstLaneMasks[1]) == I64s[1];
-    bool SecondHalfSufficient = (I64s[1] & ConstLaneMasks[0]) == I64s[0];
-    bool CombinedSufficient = (ConstLaneMasks[0] & ConstLaneMasks[1]) == 0;
-
-    uint64_t Splatted;
-    if (SecondHalfSufficient) {
-      Splatted = I64s[1];
-    } else if (CombinedSufficient) {
-      Splatted = I64s[0] | I64s[1];
-    } else {
-      Splatted = I64s[0];
-    }
-
-    Result = DAG.getSplatBuildVector(MVT::v2i64, DL,
-                                     DAG.getConstant(Splatted, DL, MVT::i64));
-    if (!FirstHalfSufficient && !SecondHalfSufficient && !CombinedSufficient) {
-      Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2i64, Result,
-                           DAG.getConstant(I64s[1], DL, MVT::i64),
-                           DAG.getConstant(1, DL, MVT::i32));
-    }
-    Result = DAG.getBitcast(VecT, Result);
-    IsLaneConstructed = [&IsConstant](size_t _, const SDValue &Lane) {
-      return IsConstant(Lane);
-    };
   } else {
     // Use a splat, but possibly a load_splat
     LoadSDNode *SplattedLoad;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index a1173ce11647..f6b9efa85cb9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -26,10 +26,6 @@ def HasSIMD128 :
     Predicate<"Subtarget->hasSIMD128()">,
     AssemblerPredicate<(all_of FeatureSIMD128), "simd128">;

-def HasUnimplementedSIMD128 :
-    Predicate<"Subtarget->hasUnimplementedSIMD128()">,
-    AssemblerPredicate<(all_of FeatureUnimplementedSIMD128), "unimplemented-simd128">;
-
 def HasAtomics :
     Predicate<"Subtarget->hasAtomics()">,
     AssemblerPredicate<(all_of FeatureAtomics), "atomics">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 98422a8264e7..d1f8cf4f5c15 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -404,8 +404,7 @@ defm : StoreLanePatNoOffset;

 // Constant: v128.const
 multiclass ConstVec<ValueType vec, dag ops, dag pat, string args> {
-  let isMoveImm = 1, isReMaterializable = 1,
-      Predicates = [HasUnimplementedSIMD128] in
+  let isMoveImm = 1, isReMaterializable = 1 in
   defm CONST_V128_#vec : SIMD_I<(outs V128:$dst), ops, (outs), ops,
                                 [(set V128:$dst, (vec.vt pat))],
                                 "v128.const\t$dst, "#args,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index a1c872ef2135..43d5871f0aa0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -36,7 +36,6 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
   enum SIMDEnum {
     NoSIMD,
     SIMD128,
-    UnimplementedSIMD128,
   } SIMDLevel = NoSIMD;

   bool HasAtomics = false;
@@ -90,9 +89,6 @@ public:
   // Predicates used by WebAssemblyInstrInfo.td.
bool hasAddr64() const { return TargetTriple.isArch64Bit(); } bool hasSIMD128() const { return SIMDLevel >= SIMD128; } - bool hasUnimplementedSIMD128() const { - return SIMDLevel >= UnimplementedSIMD128; - } bool hasAtomics() const { return HasAtomics; } bool hasNontrappingFPToInt() const { return HasNontrappingFPToInt; } bool hasSignExt() const { return HasSignExt; } diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index fca4710b582f..0268e8eb50c9 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -1,13 +1,14 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes CHECK,SIMD128,SIMD128-SLOW -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,SIMD128,SIMD128-FAST -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -fast-isel | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128,SIMD128-SLOW + +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,SIMD128,SIMD128-FAST + ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefixes CHECK,NO-SIMD128 + ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel | FileCheck %s --check-prefixes CHECK,NO-SIMD128 ; check that a non-test run (including explicit locals pass) at least finishes -; RUN: llc < %s -O0 -mattr=+unimplemented-simd128 -; RUN: llc < %s -O2 -mattr=+unimplemented-simd128 +; RUN: llc < %s -O0 -mattr=+simd128 +; RUN: llc < %s -O2 -mattr=+simd128 ; Test that basic SIMD128 arithmetic operations assemble as expected. 
diff --git a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll index 1360e0172d3f..c1060ea1101f 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-build-vector.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes=CHECK,UNIMP -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefixes=CHECK,SIMD-VM +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s ; Test that the logic to choose between v128.const vector ; initialization and splat vector initialization and to optimize the @@ -8,95 +7,11 @@ target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" -; CHECK-LABEL: emulated_const_trivial_splat: -; CHECK-NEXT: .functype emulated_const_trivial_splat () -> (v128) -; SIMD-VM-NEXT: i64.const $push0=, 8589934593 -; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 -; SIMD-VM-NEXT: return $pop1 -; UNIMP: v128.const -define <4 x i32> @emulated_const_trivial_splat() { - ret <4 x i32> -} - -; CHECK-LABEL: emulated_const_first_sufficient: -; CHECK-NEXT: .functype emulated_const_first_sufficient () -> (v128) -; SIMD-VM-NEXT: i64.const $push0=, 8589934593 -; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 -; SIMD-VM-NEXT: return $pop1 -; UNIMP: v128.const -define <4 x i32> @emulated_const_first_sufficient() { - ret <4 x i32> -} - -; CHECK-LABEL: emulated_const_second_sufficient: -; CHECK-NEXT: .functype emulated_const_second_sufficient () -> (v128) -; SIMD-VM-NEXT: i64.const $push0=, 8589934593 -; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 -; SIMD-VM-NEXT: return $pop1 -; UNIMP: v128.const -define <4 x i32> @emulated_const_second_sufficient() { - ret <4 x i32> -} - -; CHECK-LABEL: emulated_const_combined_sufficient: -; CHECK-NEXT: .functype emulated_const_combined_sufficient () -> (v128) -; SIMD-VM-NEXT: i64.const $push0=, 8589934593 -; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 -; SIMD-VM-NEXT: return $pop1 -; UNIMP: v128.const -define <4 x i32> @emulated_const_combined_sufficient() { - ret <4 x i32> -} - -; CHECK-LABEL: emulated_const_either_sufficient: -; CHECK-NEXT: .functype emulated_const_either_sufficient () -> (v128) -; SIMD-VM-NEXT: i64.const $push0=, 1 -; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 -; SIMD-VM-NEXT: return $pop1 -; UNIMP: v128.const -define <4 x i32> @emulated_const_either_sufficient() { - ret <4 x i32> -} - -; CHECK-LABEL: emulated_const_neither_sufficient: -; CHECK-NEXT: .functype emulated_const_neither_sufficient () -> (v128) -; SIMD-VM-NEXT: i64.const $push0=, 8589934593 -; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 -; SIMD-VM-NEXT: i64.const $push2=, 17179869184 -; SIMD-VM-NEXT: i64x2.replace_lane $push3=, $pop1, 1, $pop2 -; SIMD-VM-NEXT: return $pop3 -define <4 x i32> @emulated_const_neither_sufficient() { - ret <4 x i32> -} - -; CHECK-LABEL: emulated_const_combined_sufficient_large: -; CHECK-NEXT: .functype emulated_const_combined_sufficient_large () -> (v128) -; SIMD-VM-NEXT: i64.const $push0=, 506097522914230528 -; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 -; SIMD-VM-NEXT: return $pop1 -define <16 x i8> 
@emulated_const_combined_sufficient_large() { - ret <16 x i8> -} - -; CHECK-LABEL: emulated_const_neither_sufficient_large: -; CHECK-NEXT: .functype emulated_const_neither_sufficient_large () -> (v128) -; SIMD-VM-NEXT: i64.const $push0=, -70368726997663744 -; SIMD-VM-NEXT: i64x2.splat $push1=, $pop0 -; SIMD-VM-NEXT: i64.const $push2=, 504408655873966336 -; SIMD-VM-NEXT: i64x2.replace_lane $push3=, $pop1, 1, $pop2 -; SIMD-VM-NEXT: return $pop3 -define <16 x i8> @emulated_const_neither_sufficient_large() { - ret <16 x i8> -} - ; CHECK-LABEL: same_const_one_replaced_i16x8: ; CHECK-NEXT: .functype same_const_one_replaced_i16x8 (i32) -> (v128) -; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 42, 42, 42, 42, 42, 0, 42, 42 -; UNIMP-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0 -; UNIMP-NEXT: return $pop[[L1]] -; SIMD-VM: i64x2.splat +; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 42, 42, 42, 42, 42, 0, 42, 42 +; CHECK-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0 +; CHECK-NEXT: return $pop[[L1]] define <8 x i16> @same_const_one_replaced_i16x8(i16 %x) { %v = insertelement <8 x i16> , @@ -107,10 +22,9 @@ define <8 x i16> @same_const_one_replaced_i16x8(i16 %x) { ; CHECK-LABEL: different_const_one_replaced_i16x8: ; CHECK-NEXT: .functype different_const_one_replaced_i16x8 (i32) -> (v128) -; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 1, -2, 3, -4, 5, 0, 7, -8 -; UNIMP-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0 -; UNIMP-NEXT: return $pop[[L1]] -; SIMD-VM: i64x2.splat +; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 1, -2, 3, -4, 5, 0, 7, -8 +; CHECK-NEXT: i16x8.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 5, $0 +; CHECK-NEXT: return $pop[[L1]] define <8 x i16> @different_const_one_replaced_i16x8(i16 %x) { %v = insertelement <8 x i16> , @@ -121,10 +35,9 @@ define <8 x i16> @different_const_one_replaced_i16x8(i16 %x) { ; CHECK-LABEL: same_const_one_replaced_f32x4: ; CHECK-NEXT: .functype same_const_one_replaced_f32x4 (f32) -> (v128) -; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.5p5, 0x1.5p5, 0x0p0, 0x1.5p5 -; UNIMP-NEXT: f32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 2, $0 -; UNIMP-NEXT: return $pop[[L1]] -; SIMD-VM: f32x4.splat +; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1.5p5, 0x1.5p5, 0x0p0, 0x1.5p5 +; CHECK-NEXT: f32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 2, $0 +; CHECK-NEXT: return $pop[[L1]] define <4 x float> @same_const_one_replaced_f32x4(float %x) { %v = insertelement <4 x float> , @@ -135,10 +48,9 @@ define <4 x float> @same_const_one_replaced_f32x4(float %x) { ; CHECK-LABEL: different_const_one_replaced_f32x4: ; CHECK-NEXT: .functype different_const_one_replaced_f32x4 (f32) -> (v128) -; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1p0, 0x1p1, 0x0p0, 0x1p2 -; UNIMP-NEXT: f32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 2, $0 -; UNIMP-NEXT: return $pop[[L1]] -; SIMD-VM: f32x4.splat +; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x1p0, 0x1p1, 0x0p0, 0x1p2 +; CHECK-NEXT: f32x4.replace_lane $push[[L1:[0-9]+]]=, $pop[[L0]], 2, $0 +; CHECK-NEXT: return $pop[[L1]] define <4 x float> @different_const_one_replaced_f32x4(float %x) { %v = insertelement <4 x float> , @@ -149,9 +61,8 @@ define <4 x float> @different_const_one_replaced_f32x4(float %x) { ; CHECK-LABEL: splat_common_const_i32x4: ; CHECK-NEXT: .functype splat_common_const_i32x4 () -> (v128) -; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0, 3, 3, 1 -; UNIMP-NEXT: return $pop[[L0]] -; SIMD-VM: i64x2.splat +; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 0, 3, 3, 1 +; 
CHECK-NEXT: return $pop[[L0]] define <4 x i32> @splat_common_const_i32x4() { ret <4 x i32> } @@ -284,12 +195,11 @@ define <16 x i8> @mashup_swizzle_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %spla ; CHECK-LABEL: mashup_const_i8x16: ; CHECK-NEXT: .functype mashup_const_i8x16 (v128, v128, i32) -> (v128) -; UNIMP: v128.const $push[[L0:[0-9]+]]=, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0 -; UNIMP: i8x16.replace_lane -; UNIMP: i8x16.replace_lane -; UNIMP: i8x16.replace_lane -; UNIMP: return -; SIMD-VM: i64x2.splat +; CHECK: v128.const $push[[L0:[0-9]+]]=, 0, 0, 0, 0, 42, 0, 0, 0, 0, 0, 0, 0, 0, 0, 42, 0 +; CHECK: i8x16.replace_lane +; CHECK: i8x16.replace_lane +; CHECK: i8x16.replace_lane +; CHECK: return define <16 x i8> @mashup_const_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %splatted) { ; swizzle 0 %m0 = extractelement <16 x i8> %mask, i32 0 @@ -328,8 +238,8 @@ define <16 x i8> @mashup_splat_i8x16(<16 x i8> %src, <16 x i8> %mask, i8 %splatt ; CHECK-LABEL: undef_const_insert_f32x4: ; CHECK-NEXT: .functype undef_const_insert_f32x4 () -> (v128) -; UNIMP-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x0p0, 0x1.5p5, 0x0p0, 0x0p0 -; UNIMP-NEXT: return $pop[[L0]] +; CHECK-NEXT: v128.const $push[[L0:[0-9]+]]=, 0x0p0, 0x1.5p5, 0x0p0, 0x0p0 +; CHECK-NEXT: return $pop[[L0]] ; SIMD-VM: f32x4.splat define <4 x float> @undef_const_insert_f32x4() { %v = insertelement <4 x float> undef, float 42., i32 1 diff --git a/llvm/test/CodeGen/WebAssembly/simd-comparisons.ll b/llvm/test/CodeGen/WebAssembly/simd-comparisons.ll index a77f9e1fa581..475bfc5110fe 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-comparisons.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-comparisons.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes CHECK,SIMD128 -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+simd128 | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128 ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals | FileCheck %s --check-prefixes CHECK,NO-SIMD128 ; Test SIMD comparison operators diff --git a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll index 53731b0f7c16..36856336e65e 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-conversions.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-conversions.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes CHECK,SIMD128 -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128-VM +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128 ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers 
 
 ; Test that vector float-to-int and int-to-float instructions lower correctly
@@ -29,7 +28,7 @@ define <4 x float> @convert_u_v4f32(<4 x i32> %x) {
 
 ; CHECK-LABEL: convert_s_v2f64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-VM-NOT: f64x2.convert_i64x2_s
+; SIMD128-NOT: f64x2.convert_i64x2_s
 ; SIMD128-NEXT: .functype convert_s_v2f64 (v128) -> (v128){{$}}
 define <2 x double> @convert_s_v2f64(<2 x i64> %x) {
   %a = sitofp <2 x i64> %x to <2 x double>
@@ -38,7 +37,7 @@ define <2 x double> @convert_s_v2f64(<2 x i64> %x) {
 
 ; CHECK-LABEL: convert_u_v2f64:
 ; NO-SIMD128-NOT: i64x2
-; SIMD128-VM-NOT: f64x2.convert_i64x2_u
+; SIMD128-NOT: f64x2.convert_i64x2_u
 ; SIMD128-NEXT: .functype convert_u_v2f64 (v128) -> (v128){{$}}
 define <2 x double> @convert_u_v2f64(<2 x i64> %x) {
   %a = uitofp <2 x i64> %x to <2 x double>
@@ -67,7 +66,7 @@ define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
 
 ; CHECK-LABEL: trunc_sat_s_v2i64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-VM-NOT: i64x2.trunc_sat_f64x2_s
+; SIMD128-NOT: i64x2.trunc_sat_f64x2_s
 ; SIMD128-NEXT: .functype trunc_sat_s_v2i64 (v128) -> (v128){{$}}
 define <2 x i64> @trunc_sat_s_v2i64(<2 x double> %x) {
   %a = fptosi <2 x double> %x to <2 x i64>
@@ -76,7 +75,7 @@ define <2 x i64> @trunc_sat_s_v2i64(<2 x double> %x) {
 
 ; CHECK-LABEL: trunc_sat_u_v2i64:
 ; NO-SIMD128-NOT: f64x2
-; SIMD128-VM-NOT: i64x2.trunc_sat_f64x2_u
+; SIMD128-NOT: i64x2.trunc_sat_f64x2_u
 ; SIMD128-NEXT: .functype trunc_sat_u_v2i64 (v128) -> (v128){{$}}
 define <2 x i64> @trunc_sat_u_v2i64(<2 x double> %x) {
   %a = fptoui <2 x double> %x to <2 x i64>
diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
index f223615c57a1..a3b0d50903f6 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes CHECK,SIMD128
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -fast-isel | FileCheck %s
 
 ; Test that SIMD128 intrinsics lower as expected. These intrinsics are
 ; only expected to lower successfully if the simd128 attribute is
@@ -12,9 +12,9 @@ target triple = "wasm32-unknown-unknown"
 ; 16 x i8
 ; ==============================================================================
 ; CHECK-LABEL: swizzle_v16i8:
-; SIMD128-NEXT: .functype swizzle_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.swizzle $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype swizzle_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.swizzle $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.swizzle(<16 x i8>, <16 x i8>)
 define <16 x i8> @swizzle_v16i8(<16 x i8> %x, <16 x i8> %y) {
   %a = call <16 x i8> @llvm.wasm.swizzle(<16 x i8> %x, <16 x i8> %y)
@@ -22,9 +22,9 @@ define <16 x i8> @swizzle_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: add_sat_s_v16i8:
-; SIMD128-NEXT: .functype add_sat_s_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype add_sat_s_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
   %a = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
@@ -32,9 +32,9 @@ define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: add_sat_u_v16i8:
-; SIMD128-NEXT: .functype add_sat_u_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype add_sat_u_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @add_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
   %a = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> %x, <16 x i8> %y)
@@ -42,9 +42,9 @@ define <16 x i8> @add_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: sub_sat_s_v16i8:
-; SIMD128-NEXT: .functype sub_sat_s_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.sub_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype sub_sat_s_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.sub_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.sub.saturate.signed.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @sub_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
   %a = call <16 x i8> @llvm.wasm.sub.saturate.signed.v16i8(
@@ -54,9 +54,9 @@ define <16 x i8> @sub_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: sub_sat_u_v16i8:
-; SIMD128-NEXT: .functype sub_sat_u_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.sub_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype sub_sat_u_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.sub_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.sub.saturate.unsigned.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @sub_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
   %a = call <16 x i8> @llvm.wasm.sub.saturate.unsigned.v16i8(
@@ -66,9 +66,9 @@ define <16 x i8> @sub_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: avgr_u_v16i8:
-; SIMD128-NEXT: .functype avgr_u_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.avgr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype avgr_u_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.avgr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8>, <16 x i8>)
 define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
   %a = call <16 x i8> @llvm.wasm.avgr.unsigned.v16i8(<16 x i8> %x, <16 x i8> %y)
@@ -76,9 +76,9 @@ define <16 x i8> @avgr_u_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: popcnt_v16i8:
-; SIMD128-NEXT: .functype popcnt_v16i8 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.popcnt $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype popcnt_v16i8 (v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.popcnt $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.popcnt(<16 x i8>)
 define <16 x i8> @popcnt_v16i8(<16 x i8> %x) {
   %a = call <16 x i8> @llvm.wasm.popcnt(<16 x i8> %x)
@@ -86,9 +86,9 @@ define <16 x i8> @popcnt_v16i8(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: any_v16i8:
-; SIMD128-NEXT: .functype any_v16i8 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype any_v16i8 (v128) -> (i32){{$}}
+; CHECK-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v16i8(<16 x i8>)
 define i32 @any_v16i8(<16 x i8> %x) {
   %a = call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> %x)
@@ -96,9 +96,9 @@ define i32 @any_v16i8(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: all_v16i8:
-; SIMD128-NEXT: .functype all_v16i8 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype all_v16i8 (v128) -> (i32){{$}}
+; CHECK-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.alltrue.v16i8(<16 x i8>)
 define i32 @all_v16i8(<16 x i8> %x) {
   %a = call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> %x)
@@ -106,9 +106,9 @@ define i32 @all_v16i8(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: bitmask_v16i8:
-; SIMD128-NEXT: .functype bitmask_v16i8 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i8x16.bitmask $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitmask_v16i8 (v128) -> (i32){{$}}
+; CHECK-NEXT: i8x16.bitmask $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.bitmask.v16i8(<16 x i8>)
 define i32 @bitmask_v16i8(<16 x i8> %x) {
   %a = call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> %x)
@@ -116,9 +116,9 @@ define i32 @bitmask_v16i8(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: bitselect_v16i8:
-; SIMD128-NEXT: .functype bitselect_v16i8 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitselect_v16i8 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.bitselect.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
 define <16 x i8> @bitselect_v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %c) {
   %a = call <16 x i8> @llvm.wasm.bitselect.v16i8(
@@ -128,9 +128,9 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %c) {
 
 ; CHECK-LABEL: signselect_v16i8:
-; SIMD128-NEXT: .functype signselect_v16i8 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype signselect_v16i8 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.signselect.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
 define <16 x i8> @signselect_v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %c) {
   %a = call <16 x i8> @llvm.wasm.signselect.v16i8(
@@ -140,9 +140,9 @@ define <16 x i8> @signselect_v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %c) {
 }
 
 ; CHECK-LABEL: narrow_signed_v16i8:
-; SIMD128-NEXT: .functype narrow_signed_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.narrow_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype narrow_signed_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.narrow_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(<8 x i16>, <8 x i16>)
 define <16 x i8> @narrow_signed_v16i8(<8 x i16> %low, <8 x i16> %high) {
   %a = call <16 x i8> @llvm.wasm.narrow.signed.v16i8.v8i16(
@@ -152,9 +152,9 @@ define <16 x i8> @narrow_signed_v16i8(<8 x i16> %low, <8 x i16> %high) {
 }
 
 ; CHECK-LABEL: narrow_unsigned_v16i8:
-; SIMD128-NEXT: .functype narrow_unsigned_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.narrow_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype narrow_unsigned_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.narrow_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(<8 x i16>, <8 x i16>)
 define <16 x i8> @narrow_unsigned_v16i8(<8 x i16> %low, <8 x i16> %high) {
   %a = call <16 x i8> @llvm.wasm.narrow.unsigned.v16i8.v8i16(
@@ -164,11 +164,11 @@ define <16 x i8> @narrow_unsigned_v16i8(<8 x i16> %low, <8 x i16> %high) {
 }
 
 ; CHECK-LABEL: shuffle_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; NO-CHECK-NOT: i8x16
+; CHECK-NEXT: .functype shuffle_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; CHECK-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <16 x i8> @llvm.wasm.shuffle(
   <16 x i8>, <16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32,
   i32, i32, i32, i32, i32, i32)
@@ -180,11 +180,11 @@ define <16 x i8> @shuffle_v16i8(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: shuffle_undef_v16i8:
-; NO-SIMD128-NOT: i8x16
-; SIMD128-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
-; SIMD128-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; NO-CHECK-NOT: i8x16
+; CHECK-NEXT: .functype shuffle_undef_v16i8 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i8x16.shuffle $push[[R:[0-9]+]]=, $0, $1,
+; CHECK-SAME: 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
   %res = call <16 x i8> @llvm.wasm.shuffle(<16 x i8>
     %x, <16 x i8> %y, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef,
@@ -197,9 +197,9 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; 8 x i16
 ; ==============================================================================
 ; CHECK-LABEL: add_sat_s_v8i16:
-; SIMD128-NEXT: .functype add_sat_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype add_sat_s_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
   %a = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
@@ -207,9 +207,9 @@ define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: add_sat_u_v8i16:
-; SIMD128-NEXT: .functype add_sat_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype add_sat_u_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @add_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
   %a = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> %x, <8 x i16> %y)
@@ -217,9 +217,9 @@ define <8 x i16> @add_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: sub_sat_s_v8i16:
-; SIMD128-NEXT: .functype sub_sat_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.sub_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype sub_sat_s_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.sub_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.sub.saturate.signed.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @sub_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
   %a = call <8 x i16> @llvm.wasm.sub.saturate.signed.v8i16(
@@ -229,9 +229,9 @@ define <8 x i16> @sub_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: sub_sat_u_v8i16:
-; SIMD128-NEXT: .functype sub_sat_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.sub_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype sub_sat_u_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.sub_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.sub.saturate.unsigned.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @sub_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
   %a = call <8 x i16> @llvm.wasm.sub.saturate.unsigned.v8i16(
@@ -241,9 +241,9 @@ define <8 x i16> @sub_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: avgr_u_v8i16:
-; SIMD128-NEXT: .functype avgr_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.avgr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype avgr_u_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.avgr_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16>, <8 x i16>)
 define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
   %a = call <8 x i16> @llvm.wasm.avgr.unsigned.v8i16(<8 x i16> %x, <8 x i16> %y)
@@ -251,9 +251,9 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: q15mulr_sat_s_v8i16:
-; SIMD128-NEXT: .functype q15mulr_sat_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.q15mulr_sat_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype q15mulr_sat_s_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.q15mulr_sat_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.q15mulr.saturate.signed(<8 x i16>, <8 x i16>)
 define <8 x i16> @q15mulr_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
   %a = call <8 x i16> @llvm.wasm.q15mulr.saturate.signed(<8 x i16> %x,
@@ -262,9 +262,9 @@ define <8 x i16> @q15mulr_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: extmul_low_s_v8i16:
-; SIMD128-NEXT: .functype extmul_low_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extmul_low_i8x16_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_low_s_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.extmul_low_i8x16_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(<16 x i8>, <16 x i8>)
 define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %x, <16 x i8> %y) {
   %a = call <8 x i16> @llvm.wasm.extmul.low.signed.v8i16(
@@ -274,9 +274,9 @@ define <8 x i16> @extmul_low_s_v8i16(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: extmul_high_s_v8i16:
-; SIMD128-NEXT: .functype extmul_high_s_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extmul_high_i8x16_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_high_s_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.extmul_high_i8x16_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(<16 x i8>, <16 x i8>)
 define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %x, <16 x i8> %y) {
   %a = call <8 x i16> @llvm.wasm.extmul.high.signed.v8i16(
@@ -286,9 +286,9 @@ define <8 x i16> @extmul_high_s_v8i16(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: extmul_low_u_v8i16:
-; SIMD128-NEXT: .functype extmul_low_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extmul_low_i8x16_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_low_u_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.extmul_low_i8x16_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(<16 x i8>, <16 x i8>)
 define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %x, <16 x i8> %y) {
   %a = call <8 x i16> @llvm.wasm.extmul.low.unsigned.v8i16(
@@ -298,9 +298,9 @@ define <8 x i16> @extmul_low_u_v8i16(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: extmul_high_u_v8i16:
-; SIMD128-NEXT: .functype extmul_high_u_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extmul_high_i8x16_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_high_u_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.extmul_high_i8x16_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(<16 x i8>, <16 x i8>)
 define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %x, <16 x i8> %y) {
   %a = call <8 x i16> @llvm.wasm.extmul.high.unsigned.v8i16(
@@ -310,9 +310,9 @@ define <8 x i16> @extmul_high_u_v8i16(<16 x i8> %x, <16 x i8> %y) {
 }
 
 ; CHECK-LABEL: extadd_pairwise_s_v8i16:
-; SIMD128-NEXT: .functype extadd_pairwise_s_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extadd_pairwise_i8x16_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extadd_pairwise_s_v8i16 (v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.extadd_pairwise_i8x16_s $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8>)
 define <8 x i16> @extadd_pairwise_s_v8i16(<16 x i8> %x) {
   %a = call <8 x i16> @llvm.wasm.extadd.pairwise.signed.v8i16(<16 x i8> %x)
@@ -320,9 +320,9 @@ define <8 x i16> @extadd_pairwise_s_v8i16(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: extadd_pairwise_u_v8i16:
-; SIMD128-NEXT: .functype extadd_pairwise_u_v8i16 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.extadd_pairwise_i8x16_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extadd_pairwise_u_v8i16 (v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.extadd_pairwise_i8x16_u $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8>)
 define <8 x i16> @extadd_pairwise_u_v8i16(<16 x i8> %x) {
   %a = call <8 x i16> @llvm.wasm.extadd.pairwise.unsigned.v8i16(<16 x i8> %x)
@@ -330,9 +330,9 @@ define <8 x i16> @extadd_pairwise_u_v8i16(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: any_v8i16:
-; SIMD128-NEXT: .functype any_v8i16 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype any_v8i16 (v128) -> (i32){{$}}
+; CHECK-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v8i16(<8 x i16>)
 define i32 @any_v8i16(<8 x i16> %x) {
   %a = call i32 @llvm.wasm.anytrue.v8i16(<8 x i16> %x)
@@ -340,9 +340,9 @@ define i32 @any_v8i16(<8 x i16> %x) {
 }
 
 ; CHECK-LABEL: all_v8i16:
-; SIMD128-NEXT: .functype all_v8i16 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype all_v8i16 (v128) -> (i32){{$}}
+; CHECK-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.alltrue.v8i16(<8 x i16>)
 define i32 @all_v8i16(<8 x i16> %x) {
   %a = call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> %x)
@@ -350,9 +350,9 @@ define i32 @all_v8i16(<8 x i16> %x) {
 }
 
 ; CHECK-LABEL: bitmask_v8i16:
-; SIMD128-NEXT: .functype bitmask_v8i16 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i16x8.bitmask $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitmask_v8i16 (v128) -> (i32){{$}}
+; CHECK-NEXT: i16x8.bitmask $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.bitmask.v8i16(<8 x i16>)
 define i32 @bitmask_v8i16(<8 x i16> %x) {
   %a = call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> %x)
@@ -360,9 +360,9 @@ define i32 @bitmask_v8i16(<8 x i16> %x) {
 }
 
 ; CHECK-LABEL: bitselect_v8i16:
-; SIMD128-NEXT: .functype bitselect_v8i16 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitselect_v8i16 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.bitselect.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
 define <8 x i16> @bitselect_v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %c) {
   %a = call <8 x i16> @llvm.wasm.bitselect.v8i16(
@@ -372,9 +372,9 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %c) {
 }
 
 ; CHECK-LABEL: signselect_v8i16:
-; SIMD128-NEXT: .functype signselect_v8i16 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype signselect_v8i16 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.signselect.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
 define <8 x i16> @signselect_v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %c) {
   %a = call <8 x i16> @llvm.wasm.signselect.v8i16(
@@ -384,9 +384,9 @@ define <8 x i16> @signselect_v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %c) {
 }
 
 ; CHECK-LABEL: narrow_signed_v8i16:
-; SIMD128-NEXT: .functype narrow_signed_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.narrow_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype narrow_signed_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.narrow_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(<4 x i32>, <4 x i32>)
 define <8 x i16> @narrow_signed_v8i16(<4 x i32> %low, <4 x i32> %high) {
   %a = call <8 x i16> @llvm.wasm.narrow.signed.v8i16.v4i32(
@@ -396,9 +396,9 @@ define <8 x i16> @narrow_signed_v8i16(<4 x i32> %low, <4 x i32> %high) {
 }
 
 ; CHECK-LABEL: narrow_unsigned_v8i16:
-; SIMD128-NEXT: .functype narrow_unsigned_v8i16 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i16x8.narrow_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype narrow_unsigned_v8i16 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i16x8.narrow_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(<4 x i32>, <4 x i32>)
 define <8 x i16> @narrow_unsigned_v8i16(<4 x i32> %low, <4 x i32> %high) {
   %a = call <8 x i16> @llvm.wasm.narrow.unsigned.v8i16.v4i32(
@@ -411,9 +411,9 @@ define <8 x i16> @narrow_unsigned_v8i16(<4 x i32> %low, <4 x i32> %high) {
 ; 4 x i32
 ; ==============================================================================
 ; CHECK-LABEL: dot:
-; SIMD128-NEXT: .functype dot (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.dot_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype dot (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.dot_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.dot(<8 x i16>, <8 x i16>)
 define <4 x i32> @dot(<8 x i16> %x, <8 x i16> %y) {
   %a = call <4 x i32> @llvm.wasm.dot(<8 x i16> %x, <8 x i16> %y)
@@ -422,9 +422,9 @@ define <4 x i32> @dot(<8 x i16> %x, <8 x i16> %y) {
 
 ; CHECK-LABEL: extmul_low_s_v4i32:
-; SIMD128-NEXT: .functype extmul_low_s_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extmul_low_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_low_s_v4i32 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.extmul_low_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(<8 x i16>, <8 x i16>)
 define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %x, <8 x i16> %y) {
   %a = call <4 x i32> @llvm.wasm.extmul.low.signed.v4i32(
@@ -434,9 +434,9 @@ define <4 x i32> @extmul_low_s_v4i32(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: extmul_high_s_v4i32:
-; SIMD128-NEXT: .functype extmul_high_s_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extmul_high_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_high_s_v4i32 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.extmul_high_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(<8 x i16>, <8 x i16>)
 define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %x, <8 x i16> %y) {
   %a = call <4 x i32> @llvm.wasm.extmul.high.signed.v4i32(
@@ -446,9 +446,9 @@ define <4 x i32> @extmul_high_s_v4i32(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: extmul_low_u_v4i32:
-; SIMD128-NEXT: .functype extmul_low_u_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extmul_low_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_low_u_v4i32 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.extmul_low_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(<8 x i16>, <8 x i16>)
 define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %x, <8 x i16> %y) {
   %a = call <4 x i32> @llvm.wasm.extmul.low.unsigned.v4i32(
@@ -458,9 +458,9 @@ define <4 x i32> @extmul_low_u_v4i32(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: extmul_high_u_v4i32:
-; SIMD128-NEXT: .functype extmul_high_u_v4i32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extmul_high_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_high_u_v4i32 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.extmul_high_i16x8_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(<8 x i16>, <8 x i16>)
 define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %x, <8 x i16> %y) {
   %a = call <4 x i32> @llvm.wasm.extmul.high.unsigned.v4i32(
@@ -470,9 +470,9 @@ define <4 x i32> @extmul_high_u_v4i32(<8 x i16> %x, <8 x i16> %y) {
 }
 
 ; CHECK-LABEL: extadd_pairwise_s_v4i32:
-; SIMD128-NEXT: .functype extadd_pairwise_s_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extadd_pairwise_i16x8_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extadd_pairwise_s_v4i32 (v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.extadd_pairwise_i16x8_s $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16>)
 define <4 x i32> @extadd_pairwise_s_v4i32(<8 x i16> %x) {
   %a = call <4 x i32> @llvm.wasm.extadd.pairwise.signed.v4i32(<8 x i16> %x)
@@ -480,9 +480,9 @@ define <4 x i32> @extadd_pairwise_s_v4i32(<8 x i16> %x) {
 }
 
 ; CHECK-LABEL: extadd_pairwise_u_v4i32:
-; SIMD128-NEXT: .functype extadd_pairwise_u_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.extadd_pairwise_i16x8_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extadd_pairwise_u_v4i32 (v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.extadd_pairwise_i16x8_u $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16>)
 define <4 x i32> @extadd_pairwise_u_v4i32(<8 x i16> %x) {
   %a = call <4 x i32> @llvm.wasm.extadd.pairwise.unsigned.v4i32(<8 x i16> %x)
@@ -491,9 +491,9 @@ define <4 x i32> @extadd_pairwise_u_v4i32(<8 x i16> %x) {
 
 ; CHECK-LABEL: any_v4i32:
-; SIMD128-NEXT: .functype any_v4i32 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype any_v4i32 (v128) -> (i32){{$}}
+; CHECK-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v4i32(<4 x i32>)
 define i32 @any_v4i32(<4 x i32> %x) {
   %a = call i32 @llvm.wasm.anytrue.v4i32(<4 x i32> %x)
@@ -501,9 +501,9 @@ define i32 @any_v4i32(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: all_v4i32:
-; SIMD128-NEXT: .functype all_v4i32 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype all_v4i32 (v128) -> (i32){{$}}
+; CHECK-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.alltrue.v4i32(<4 x i32>)
 define i32 @all_v4i32(<4 x i32> %x) {
   %a = call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> %x)
@@ -511,9 +511,9 @@ define i32 @all_v4i32(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: bitmask_v4i32:
-; SIMD128-NEXT: .functype bitmask_v4i32 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i32x4.bitmask $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitmask_v4i32 (v128) -> (i32){{$}}
+; CHECK-NEXT: i32x4.bitmask $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.bitmask.v4i32(<4 x i32>)
 define i32 @bitmask_v4i32(<4 x i32> %x) {
   %a = call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> %x)
@@ -521,9 +521,9 @@ define i32 @bitmask_v4i32(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: bitselect_v4i32:
-; SIMD128-NEXT: .functype bitselect_v4i32 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitselect_v4i32 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.bitselect.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define <4 x i32> @bitselect_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %c) {
   %a = call <4 x i32> @llvm.wasm.bitselect.v4i32(
@@ -533,9 +533,9 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %c) {
 }
 
 ; CHECK-LABEL: signselect_v4i32:
-; SIMD128-NEXT: .functype signselect_v4i32 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype signselect_v4i32 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.signselect.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 define <4 x i32> @signselect_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %c) {
   %a = call <4 x i32> @llvm.wasm.signselect.v4i32(
@@ -545,10 +545,10 @@ define <4 x i32> @signselect_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %c) {
 }
 
 ; CHECK-LABEL: trunc_sat_s_v4i32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype trunc_sat_s_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.trunc_sat_f32x4_s $push[[R:[0-9]+]]=, $0
-; SIMD128-NEXT: return $pop[[R]]
+; NO-CHECK-NOT: f32x4
+; CHECK-NEXT: .functype trunc_sat_s_v4i32 (v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_s $push[[R:[0-9]+]]=, $0
+; CHECK-NEXT: return $pop[[R]]
 declare <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float>)
 define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) {
   %a = call <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float> %x)
@@ -556,10 +556,10 @@ define <4 x i32> @trunc_sat_s_v4i32(<4 x float> %x) {
 }
 
 ; CHECK-LABEL: trunc_sat_u_v4i32:
-; NO-SIMD128-NOT: f32x4
-; SIMD128-NEXT: .functype trunc_sat_u_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.trunc_sat_f32x4_u $push[[R:[0-9]+]]=, $0
-; SIMD128-NEXT: return $pop[[R]]
+; NO-CHECK-NOT: f32x4
+; CHECK-NEXT: .functype trunc_sat_u_v4i32 (v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.trunc_sat_f32x4_u $push[[R:[0-9]+]]=, $0
+; CHECK-NEXT: return $pop[[R]]
 declare <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float>)
 define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
   %a = call <4 x i32> @llvm.wasm.trunc.saturate.unsigned.v4i32.v4f32(<4 x float> %x)
@@ -567,9 +567,9 @@ define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) {
 }
 
 ; CHECK-LABEL: trunc_sat_zero_signed_v4i32:
-; SIMD128-NEXT: .functype trunc_sat_zero_signed_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.trunc_sat_zero_f64x2_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype trunc_sat_zero_signed_v4i32 (v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.trunc_sat_zero_f64x2_s $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.trunc.saturate.zero.signed(<2 x double>)
 define <4 x i32> @trunc_sat_zero_signed_v4i32(<2 x double> %a) {
   %v = call <4 x i32> @llvm.wasm.trunc.saturate.zero.signed(<2 x double> %a)
@@ -577,9 +577,9 @@ define <4 x i32> @trunc_sat_zero_signed_v4i32(<2 x double> %a) {
 }
 
 ; CHECK-LABEL: trunc_sat_zero_unsigned_v4i32:
-; SIMD128-NEXT: .functype trunc_sat_zero_unsigned_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.trunc_sat_zero_f64x2_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype trunc_sat_zero_unsigned_v4i32 (v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.trunc_sat_zero_f64x2_u $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.trunc.saturate.zero.unsigned(<2 x double>)
 define <4 x i32> @trunc_sat_zero_unsigned_v4i32(<2 x double> %a) {
   %v = call <4 x i32> @llvm.wasm.trunc.saturate.zero.unsigned(<2 x double> %a)
@@ -588,9 +588,9 @@ define <4 x i32> @trunc_sat_zero_unsigned_v4i32(<2 x double> %a) {
 
 ; CHECK-LABEL: widen_signed_v4i32:
-; SIMD128-NEXT: .functype widen_signed_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.widen_i8x16_s $push[[R:[0-9]+]]=, $0, 1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype widen_signed_v4i32 (v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.widen_i8x16_s $push[[R:[0-9]+]]=, $0, 1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.widen.signed(<16 x i8>, i32 immarg)
 define <4 x i32> @widen_signed_v4i32(<16 x i8> %x) {
   %v = call <4 x i32> @llvm.wasm.widen.signed(<16 x i8> %x, i32 1)
@@ -598,9 +598,9 @@ define <4 x i32> @widen_signed_v4i32(<16 x i8> %x) {
 }
 
 ; CHECK-LABEL: widen_unsigned_v4i32:
-; SIMD128-NEXT: .functype widen_unsigned_v4i32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i32x4.widen_i8x16_u $push[[R:[0-9]+]]=, $0, 1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype widen_unsigned_v4i32 (v128) -> (v128){{$}}
+; CHECK-NEXT: i32x4.widen_i8x16_u $push[[R:[0-9]+]]=, $0, 1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x i32> @llvm.wasm.widen.unsigned(<16 x i8>, i32 immarg)
 define <4 x i32> @widen_unsigned_v4i32(<16 x i8> %x) {
   %v = call <4 x i32> @llvm.wasm.widen.unsigned(<16 x i8> %x, i32 1)
@@ -611,9 +611,9 @@ define <4 x i32> @widen_unsigned_v4i32(<16 x i8> %x) {
 ; 2 x i64
 ; ==============================================================================
 ; CHECK-LABEL: eq_v2i64:
-; SIMD128-NEXT: .functype eq_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype eq_v2i64 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.eq $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.eq(<2 x i64>, <2 x i64>)
 define <2 x i64> @eq_v2i64(<2 x i64> %x, <2 x i64> %y) {
   %a = call <2 x i64> @llvm.wasm.eq(<2 x i64> %x, <2 x i64> %y)
@@ -621,9 +621,9 @@ define <2 x i64> @eq_v2i64(<2 x i64> %x, <2 x i64> %y) {
 }
 
 ; CHECK-LABEL: widen_low_s_v2i64:
-; SIMD128-NEXT: .functype widen_low_s_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.widen_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype widen_low_s_v2i64 (v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.widen_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.widen.low.signed(<4 x i32>)
 define <2 x i64> @widen_low_s_v2i64(<4 x i32> %x) {
   %a = call <2 x i64> @llvm.wasm.widen.low.signed(<4 x i32> %x)
@@ -631,9 +631,9 @@ define <2 x i64> @widen_low_s_v2i64(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: widen_high_s_v2i64:
-; SIMD128-NEXT: .functype widen_high_s_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.widen_high_i32x4_s $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype widen_high_s_v2i64 (v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.widen_high_i32x4_s $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.widen.high.signed(<4 x i32>)
 define <2 x i64> @widen_high_s_v2i64(<4 x i32> %x) {
   %a = call <2 x i64> @llvm.wasm.widen.high.signed(<4 x i32> %x)
@@ -641,9 +641,9 @@ define <2 x i64> @widen_high_s_v2i64(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: widen_low_u_v2i64:
-; SIMD128-NEXT: .functype widen_low_u_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.widen_low_i32x4_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype widen_low_u_v2i64 (v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.widen_low_i32x4_u $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.widen.low.unsigned(<4 x i32>)
 define <2 x i64> @widen_low_u_v2i64(<4 x i32> %x) {
   %a = call <2 x i64> @llvm.wasm.widen.low.unsigned(<4 x i32> %x)
@@ -651,9 +651,9 @@ define <2 x i64> @widen_low_u_v2i64(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: widen_high_u_v2i64:
-; SIMD128-NEXT: .functype widen_high_u_v2i64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.widen_high_i32x4_u $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype widen_high_u_v2i64 (v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.widen_high_i32x4_u $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.widen.high.unsigned(<4 x i32>)
 define <2 x i64> @widen_high_u_v2i64(<4 x i32> %x) {
   %a = call <2 x i64> @llvm.wasm.widen.high.unsigned(<4 x i32> %x)
@@ -661,9 +661,9 @@ define <2 x i64> @widen_high_u_v2i64(<4 x i32> %x) {
 }
 
 ; CHECK-LABEL: extmul_low_s_v2i64:
-; SIMD128-NEXT: .functype extmul_low_s_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.extmul_low_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_low_s_v2i64 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.extmul_low_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(<4 x i32>, <4 x i32>)
 define <2 x i64> @extmul_low_s_v2i64(<4 x i32> %x, <4 x i32> %y) {
   %a = call <2 x i64> @llvm.wasm.extmul.low.signed.v2i64(
@@ -673,9 +673,9 @@ define <2 x i64> @extmul_low_s_v2i64(<4 x i32> %x, <4 x i32> %y) {
 }
 
 ; CHECK-LABEL: extmul_high_s_v2i64:
-; SIMD128-NEXT: .functype extmul_high_s_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.extmul_high_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_high_s_v2i64 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.extmul_high_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(<4 x i32>, <4 x i32>)
 define <2 x i64> @extmul_high_s_v2i64(<4 x i32> %x, <4 x i32> %y) {
   %a = call <2 x i64> @llvm.wasm.extmul.high.signed.v2i64(
@@ -685,9 +685,9 @@ define <2 x i64> @extmul_high_s_v2i64(<4 x i32> %x, <4 x i32> %y) {
 }
 
 ; CHECK-LABEL: extmul_low_u_v2i64:
-; SIMD128-NEXT: .functype extmul_low_u_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.extmul_low_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_low_u_v2i64 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.extmul_low_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(<4 x i32>, <4 x i32>)
 define <2 x i64> @extmul_low_u_v2i64(<4 x i32> %x, <4 x i32> %y) {
   %a = call <2 x i64> @llvm.wasm.extmul.low.unsigned.v2i64(
@@ -697,9 +697,9 @@ define <2 x i64> @extmul_low_u_v2i64(<4 x i32> %x, <4 x i32> %y) {
 }
 
 ; CHECK-LABEL: extmul_high_u_v2i64:
-; SIMD128-NEXT: .functype extmul_high_u_v2i64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.extmul_high_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype extmul_high_u_v2i64 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.extmul_high_i32x4_u $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(<4 x i32>, <4 x i32>)
 define <2 x i64> @extmul_high_u_v2i64(<4 x i32> %x, <4 x i32> %y) {
   %a = call <2 x i64> @llvm.wasm.extmul.high.unsigned.v2i64(
@@ -709,9 +709,9 @@ define <2 x i64> @extmul_high_u_v2i64(<4 x i32> %x, <4 x i32> %y) {
 }
 
 ; CHECK-LABEL: any_v2i64:
-; SIMD128-NEXT: .functype any_v2i64 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype any_v2i64 (v128) -> (i32){{$}}
+; CHECK-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.anytrue.v2i64(<2 x i64>)
 define i32 @any_v2i64(<2 x i64> %x) {
   %a = call i32 @llvm.wasm.anytrue.v2i64(<2 x i64> %x)
@@ -719,9 +719,9 @@ define i32 @any_v2i64(<2 x i64> %x) {
 }
 
 ; CHECK-LABEL: all_v2i64:
-; SIMD128-NEXT: .functype all_v2i64 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype all_v2i64 (v128) -> (i32){{$}}
+; CHECK-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.alltrue.v2i64(<2 x i64>)
 define i32 @all_v2i64(<2 x i64> %x) {
   %a = call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> %x)
@@ -729,9 +729,9 @@ define i32 @all_v2i64(<2 x i64> %x) {
 }
 
 ; CHECK-LABEL: bitmask_v2i64:
-; SIMD128-NEXT: .functype bitmask_v2i64 (v128) -> (i32){{$}}
-; SIMD128-NEXT: i64x2.bitmask $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitmask_v2i64 (v128) -> (i32){{$}}
+; CHECK-NEXT: i64x2.bitmask $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare i32 @llvm.wasm.bitmask.v2i64(<2 x i64>)
 define i32 @bitmask_v2i64(<2 x i64> %x) {
   %a = call i32 @llvm.wasm.bitmask.v2i64(<2 x i64> %x)
@@ -739,9 +739,9 @@ define i32 @bitmask_v2i64(<2 x i64> %x) {
 }
 
 ; CHECK-LABEL: bitselect_v2i64:
-; SIMD128-NEXT: .functype bitselect_v2i64 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitselect_v2i64 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.bitselect.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 define <2 x i64> @bitselect_v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %c) {
   %a = call <2 x i64> @llvm.wasm.bitselect.v2i64(
@@ -751,9 +751,9 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %c) {
 }
 
 ; CHECK-LABEL: signselect_v2i64:
-; SIMD128-NEXT: .functype signselect_v2i64 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: i64x2.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype signselect_v2i64 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: i64x2.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x i64> @llvm.wasm.signselect.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
 define <2 x i64> @signselect_v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %c) {
   %a = call <2 x i64> @llvm.wasm.signselect.v2i64(
@@ -766,9 +766,9 @@ define <2 x i64> @signselect_v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %c) {
 ; 4 x f32
 ; ==============================================================================
 ; CHECK-LABEL: bitselect_v4f32:
-; SIMD128-NEXT: .functype bitselect_v4f32 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitselect_v4f32 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.bitselect.v4f32(<4 x float>, <4 x float>, <4 x float>)
 define <4 x float> @bitselect_v4f32(<4 x float> %v1, <4 x float> %v2, <4 x float> %c) {
   %a = call <4 x float> @llvm.wasm.bitselect.v4f32(
@@ -778,9 +778,9 @@ define <4 x float> @bitselect_v4f32(<4 x float> %v1, <4 x float> %v2, <4 x float
 }
 
 ; CHECK-LABEL: pmin_v4f32:
-; SIMD128-NEXT: .functype pmin_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype pmin_v4f32 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.pmin.v4f32(<4 x float>, <4 x float>)
 define <4 x float> @pmin_v4f32(<4 x float> %a, <4 x float> %b) {
   %v = call <4 x float> @llvm.wasm.pmin.v4f32(<4 x float> %a, <4 x float> %b)
@@ -788,9 +788,9 @@ define <4 x float> @pmin_v4f32(<4 x float> %a, <4 x float> %b) {
 }
 
 ; CHECK-LABEL: pmax_v4f32:
-; SIMD128-NEXT: .functype pmax_v4f32 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype pmax_v4f32 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.pmax.v4f32(<4 x float>, <4 x float>)
 define <4 x float> @pmax_v4f32(<4 x float> %a, <4 x float> %b) {
   %v = call <4 x float> @llvm.wasm.pmax.v4f32(<4 x float> %a, <4 x float> %b)
@@ -798,9 +798,9 @@ define <4 x float> @pmax_v4f32(<4 x float> %a, <4 x float> %b) {
 }
 
 ; CHECK-LABEL: ceil_v4f32:
-; SIMD128-NEXT: .functype ceil_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.ceil $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype ceil_v4f32 (v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.ceil $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.ceil.v4f32(<4 x float>)
 define <4 x float> @ceil_v4f32(<4 x float> %a) {
   %v = call <4 x float> @llvm.wasm.ceil.v4f32(<4 x float> %a)
@@ -808,9 +808,9 @@ define <4 x float> @ceil_v4f32(<4 x float> %a) {
 }
 
 ; CHECK-LABEL: floor_v4f32:
-; SIMD128-NEXT: .functype floor_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.floor $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype floor_v4f32 (v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.floor $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.floor.v4f32(<4 x float>)
 define <4 x float> @floor_v4f32(<4 x float> %a) {
   %v = call <4 x float> @llvm.wasm.floor.v4f32(<4 x float> %a)
@@ -818,9 +818,9 @@ define <4 x float> @floor_v4f32(<4 x float> %a) {
 }
 
 ; CHECK-LABEL: trunc_v4f32:
-; SIMD128-NEXT: .functype trunc_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.trunc $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype trunc_v4f32 (v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.trunc $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.trunc.v4f32(<4 x float>)
 define <4 x float> @trunc_v4f32(<4 x float> %a) {
   %v = call <4 x float> @llvm.wasm.trunc.v4f32(<4 x float> %a)
@@ -828,9 +828,9 @@ define <4 x float> @trunc_v4f32(<4 x float> %a) {
 }
 
 ; CHECK-LABEL: nearest_v4f32:
-; SIMD128-NEXT: .functype nearest_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.nearest $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype nearest_v4f32 (v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.nearest $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.nearest.v4f32(<4 x float>)
 define <4 x float> @nearest_v4f32(<4 x float> %a) {
   %v = call <4 x float> @llvm.wasm.nearest.v4f32(<4 x float> %a)
@@ -838,9 +838,9 @@ define <4 x float> @nearest_v4f32(<4 x float> %a) {
 }
 
 ; CHECK-LABEL: qfma_v4f32:
-; SIMD128-NEXT: .functype qfma_v4f32 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.qfma $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype qfma_v4f32 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.qfma $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.qfma.v4f32(<4 x float>, <4 x float>, <4 x float>)
 define <4 x float> @qfma_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
   %v = call <4 x float> @llvm.wasm.qfma.v4f32(
@@ -850,9 +850,9 @@ define <4 x float> @qfma_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 }
 
 ; CHECK-LABEL: qfms_v4f32:
-; SIMD128-NEXT: .functype qfms_v4f32 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.qfms $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype qfms_v4f32 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.qfms $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.qfms.v4f32(<4 x float>, <4 x float>, <4 x float>)
 define <4 x float> @qfms_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
   %v = call <4 x float> @llvm.wasm.qfms.v4f32(
@@ -862,9 +862,9 @@ define <4 x float> @qfms_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
 }
 
 ; CHECK-LABEL: demote_zero_v4f32:
-; SIMD128-NEXT: .functype demote_zero_v4f32 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f32x4.demote_zero_f64x2 $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype demote_zero_v4f32 (v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.demote_zero_f64x2 $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <4 x float> @llvm.wasm.demote.zero(<2 x double>)
 define <4 x float> @demote_zero_v4f32(<2 x double> %a) {
   %v = call <4 x float> @llvm.wasm.demote.zero(<2 x double> %a)
@@ -875,9 +875,9 @@ define <4 x float> @demote_zero_v4f32(<2 x double> %a) {
 ; 2 x f64
 ; ==============================================================================
 ; CHECK-LABEL: bitselect_v2f64:
-; SIMD128-NEXT: .functype bitselect_v2f64 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype bitselect_v2f64 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.bitselect.v2f64(<2 x double>, <2 x double>, <2 x double>)
 define <2 x double> @bitselect_v2f64(<2 x double> %v1, <2 x double> %v2, <2 x double> %c) {
   %a = call <2 x double> @llvm.wasm.bitselect.v2f64(
@@ -887,9 +887,9 @@ define <2 x double> @bitselect_v2f64(<2 x double> %v1, <2 x double> %v2, <2 x do
 }
 
 ; CHECK-LABEL: pmin_v2f64:
-; SIMD128-NEXT: .functype pmin_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype pmin_v2f64 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f64x2.pmin $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.pmin.v2f64(<2 x double>, <2 x double>)
 define <2 x double> @pmin_v2f64(<2 x double> %a, <2 x double> %b) {
   %v = call <2 x double> @llvm.wasm.pmin.v2f64(<2 x double> %a, <2 x double> %b)
@@ -897,9 +897,9 @@ define <2 x double> @pmin_v2f64(<2 x double> %a, <2 x double> %b) {
 }
 
 ; CHECK-LABEL: pmax_v2f64:
-; SIMD128-NEXT: .functype pmax_v2f64 (v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype pmax_v2f64 (v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f64x2.pmax $push[[R:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.pmax.v2f64(<2 x double>, <2 x double>)
 define <2 x double> @pmax_v2f64(<2 x double> %a, <2 x double> %b) {
   %v = call <2 x double> @llvm.wasm.pmax.v2f64(<2 x double> %a, <2 x double> %b)
@@ -907,9 +907,9 @@ define <2 x double> @pmax_v2f64(<2 x double> %a, <2 x double> %b) {
 }
 
 ; CHECK-LABEL: ceil_v2f64:
-; SIMD128-NEXT: .functype ceil_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.ceil $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype ceil_v2f64 (v128) -> (v128){{$}}
+; CHECK-NEXT: f64x2.ceil $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.ceil.v2f64(<2 x double>)
 define <2 x double> @ceil_v2f64(<2 x double> %a) {
   %v = call <2 x double> @llvm.wasm.ceil.v2f64(<2 x double> %a)
@@ -917,9 +917,9 @@ define <2 x double> @ceil_v2f64(<2 x double> %a) {
 }
 
 ; CHECK-LABEL: floor_v2f64:
-; SIMD128-NEXT: .functype floor_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.floor $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype floor_v2f64 (v128) -> (v128){{$}}
+; CHECK-NEXT: f64x2.floor $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.floor.v2f64(<2 x double>)
 define <2 x double> @floor_v2f64(<2 x double> %a) {
   %v = call <2 x double> @llvm.wasm.floor.v2f64(<2 x double> %a)
@@ -927,9 +927,9 @@ define <2 x double> @floor_v2f64(<2 x double> %a) {
 }
 
 ; CHECK-LABEL: trunc_v2f64:
-; SIMD128-NEXT: .functype trunc_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.trunc $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype trunc_v2f64 (v128) -> (v128){{$}}
+; CHECK-NEXT: f64x2.trunc $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.trunc.v2f64(<2 x double>)
 define <2 x double> @trunc_v2f64(<2 x double> %a) {
   %v = call <2 x double> @llvm.wasm.trunc.v2f64(<2 x double> %a)
@@ -937,9 +937,9 @@ define <2 x double> @trunc_v2f64(<2 x double> %a) {
 }
 
 ; CHECK-LABEL: nearest_v2f64:
-; SIMD128-NEXT: .functype nearest_v2f64 (v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.nearest $push[[R:[0-9]+]]=, $0{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype nearest_v2f64 (v128) -> (v128){{$}}
+; CHECK-NEXT: f64x2.nearest $push[[R:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.nearest.v2f64(<2 x double>)
 define <2 x double> @nearest_v2f64(<2 x double> %a) {
   %v = call <2 x double> @llvm.wasm.nearest.v2f64(<2 x double> %a)
@@ -947,9 +947,9 @@ define <2 x double> @nearest_v2f64(<2 x double> %a) {
 }
 
 ; CHECK-LABEL: qfma_v2f64:
-; SIMD128-NEXT: .functype qfma_v2f64 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.qfma $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype qfma_v2f64 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f64x2.qfma $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.qfma.v2f64(<2 x double>, <2 x double>, <2 x double>)
 define <2 x double> @qfma_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
   %v = call <2 x double> @llvm.wasm.qfma.v2f64(
@@ -959,9 +959,9 @@ define <2 x double> @qfma_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %
 }
 
 ; CHECK-LABEL: qfms_v2f64:
-; SIMD128-NEXT: .functype qfms_v2f64 (v128, v128, v128) -> (v128){{$}}
-; SIMD128-NEXT: f64x2.qfms $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
-; SIMD128-NEXT: return $pop[[R]]{{$}}
+; CHECK-NEXT: .functype qfms_v2f64 (v128, v128, v128) -> (v128){{$}}
+; CHECK-NEXT: f64x2.qfms $push[[R:[0-9]+]]=, $0, $1, $2{{$}}
+; CHECK-NEXT: return $pop[[R]]{{$}}
 declare <2 x double> @llvm.wasm.qfms.v2f64(<2 x double>, <2 x double>, <2 x double>)
 define <2 x double> @qfms_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
   %v = call <2 x double> @llvm.wasm.qfms.v2f64(
@@ -971,9 +971,9 @@ define <2 x double> @qfms_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %
@qfms_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> % } ; CHECK-LABEL: convert_low_signed_v2f64: -; SIMD128-NEXT: .functype convert_low_signed_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64x2.convert_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype convert_low_signed_v2f64 (v128) -> (v128){{$}} +; CHECK-NEXT: f64x2.convert_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} declare <2 x double> @llvm.wasm.convert.low.signed(<4 x i32>) define <2 x double> @convert_low_signed_v2f64(<4 x i32> %a) { %v = call <2 x double> @llvm.wasm.convert.low.signed(<4 x i32> %a) @@ -981,9 +981,9 @@ define <2 x double> @convert_low_signed_v2f64(<4 x i32> %a) { } ; CHECK-LABEL: convert_low_unsigned_v2f64: -; SIMD128-NEXT: .functype convert_low_unsigned_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64x2.convert_low_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype convert_low_unsigned_v2f64 (v128) -> (v128){{$}} +; CHECK-NEXT: f64x2.convert_low_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} declare <2 x double> @llvm.wasm.convert.low.unsigned(<4 x i32>) define <2 x double> @convert_low_unsigned_v2f64(<4 x i32> %a) { %v = call <2 x double> @llvm.wasm.convert.low.unsigned(<4 x i32> %a) @@ -991,9 +991,9 @@ define <2 x double> @convert_low_unsigned_v2f64(<4 x i32> %a) { } ; CHECK-LABEL: promote_low_v2f64: -; SIMD128-NEXT: .functype promote_low_v2f64 (v128) -> (v128){{$}} -; SIMD128-NEXT: f64x2.promote_low_f32x4 $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype promote_low_v2f64 (v128) -> (v128){{$}} +; CHECK-NEXT: f64x2.promote_low_f32x4 $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} declare <2 x double> @llvm.wasm.promote.low(<4 x float>) define <2 x double> @promote_low_v2f64(<4 x float> %a) { %v = call <2 x double> @llvm.wasm.promote.low(<4 x float> %a) diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll b/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll index 3d08a586edb5..f976ac2630d5 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-load-splat.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+unimplemented-simd128 | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-keep-registers -wasm-disable-explicit-locals -mattr=+simd128 | FileCheck %s ; Regression test for an ISel failure when a splatted load had more ; than one use. The main tests for load_splat are in simd-offset.ll. diff --git a/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll b/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll index 000b7730e3bf..9e16392c396e 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-load-store-alignment.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s ; Test loads and stores with custom alignment values. 
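; Where only the SIMD128 configuration remains after these updates, the
; dedicated SIMD128 check prefix is redundant and the default CHECK prefix
; suffices, as in the rewritten assertions above. A minimal sketch of the
; resulting RUN/CHECK pattern (the function and its checks are illustrative,
; not taken from these tests):
;
;   RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s
;
;   CHECK-LABEL: example:
;   CHECK-NEXT: .functype example (v128, v128) -> (v128){{$}}
;   CHECK-NEXT: i32x4.add $push[[R:[0-9]+]]=, $0, $1{{$}}
;   CHECK-NEXT: return $pop[[R]]{{$}}
;   define <4 x i32> @example(<4 x i32> %a, <4 x i32> %b) {
;     %v = add <4 x i32> %a, %b
;     ret <4 x i32> %v
;   }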
diff --git a/llvm/test/CodeGen/WebAssembly/simd-noopt.ll b/llvm/test/CodeGen/WebAssembly/simd-noopt.ll deleted file mode 100644 index 1ec259ccca73..000000000000 --- a/llvm/test/CodeGen/WebAssembly/simd-noopt.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llc < %s -fast-isel -mattr=+simd128,+sign-ext -verify-machineinstrs - -;; Ensures fastisel produces valid code when storing and loading split -;; up v2i64 values. Lowering away v2i64s is a temporary measure while -;; V8 does not have support for i64x2.* operations, and is done when -;; -wasm-enable-unimplemented-simd is not present. This is a -;; regression test for a bug that crashed llc after fastisel produced -;; machineinstrs that used registers that had never been defined. - -target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" -target triple = "wasm32-unknown-unknown" - -define i64 @foo(<2 x i64> %vec) { -entry: - %vec.addr = alloca <2 x i64>, align 16 - store <2 x i64> %vec, <2 x i64>* %vec.addr, align 16 - %0 = load <2 x i64>, <2 x i64>* %vec.addr, align 16 - %1 = extractelement <2 x i64> %0, i32 0 - ret i64 %1 -} diff --git a/llvm/test/CodeGen/WebAssembly/simd-reductions.ll b/llvm/test/CodeGen/WebAssembly/simd-reductions.ll index 037cac514b03..259ef3b3a81f 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-reductions.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-reductions.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes CHECK,SIMD128 +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s ; Tests that redundant masking and conversions are folded out ; following SIMD reduction instructions. 
@@ -13,9 +13,9 @@ declare i32 @llvm.wasm.anytrue.v16i8(<16 x i8>) declare i32 @llvm.wasm.alltrue.v16i8(<16 x i8>) ; CHECK-LABEL: any_v16i8_trunc: -; SIMD128-NEXT: .functype any_v16i8_trunc (v128) -> (i32){{$}} -; SIMD128-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v16i8_trunc (v128) -> (i32){{$}} +; CHECK-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v16i8_trunc(<16 x i8> %x) { %a = call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> %x) %b = trunc i32 %a to i1 @@ -24,9 +24,9 @@ define i32 @any_v16i8_trunc(<16 x i8> %x) { } ; CHECK-LABEL: any_v16i8_ne: -; SIMD128-NEXT: .functype any_v16i8_ne (v128) -> (i32){{$}} -; SIMD128-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v16i8_ne (v128) -> (i32){{$}} +; CHECK-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v16i8_ne(<16 x i8> %x) { %a = call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> %x) %b = icmp ne i32 %a, 0 @@ -35,9 +35,9 @@ define i32 @any_v16i8_ne(<16 x i8> %x) { } ; CHECK-LABEL: any_v16i8_eq: -; SIMD128-NEXT: .functype any_v16i8_eq (v128) -> (i32){{$}} -; SIMD128-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v16i8_eq (v128) -> (i32){{$}} +; CHECK-NEXT: i8x16.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v16i8_eq(<16 x i8> %x) { %a = call i32 @llvm.wasm.anytrue.v16i8(<16 x i8> %x) %b = icmp eq i32 %a, 1 @@ -46,9 +46,9 @@ define i32 @any_v16i8_eq(<16 x i8> %x) { } ; CHECK-LABEL: all_v16i8_trunc: -; SIMD128-NEXT: .functype all_v16i8_trunc (v128) -> (i32){{$}} -; SIMD128-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v16i8_trunc (v128) -> (i32){{$}} +; CHECK-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v16i8_trunc(<16 x i8> %x) { %a = call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> %x) %b = trunc i32 %a to i1 @@ -57,9 +57,9 @@ define i32 @all_v16i8_trunc(<16 x i8> %x) { } ; CHECK-LABEL: all_v16i8_ne: -; SIMD128-NEXT: .functype all_v16i8_ne (v128) -> (i32){{$}} -; SIMD128-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v16i8_ne (v128) -> (i32){{$}} +; CHECK-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v16i8_ne(<16 x i8> %x) { %a = call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> %x) %b = icmp ne i32 %a, 0 @@ -68,9 +68,9 @@ define i32 @all_v16i8_ne(<16 x i8> %x) { } ; CHECK-LABEL: all_v16i8_eq: -; SIMD128-NEXT: .functype all_v16i8_eq (v128) -> (i32){{$}} -; SIMD128-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v16i8_eq (v128) -> (i32){{$}} +; CHECK-NEXT: i8x16.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v16i8_eq(<16 x i8> %x) { %a = call i32 @llvm.wasm.alltrue.v16i8(<16 x i8> %x) %b = icmp eq i32 %a, 1 @@ -85,9 +85,9 @@ declare i32 @llvm.wasm.anytrue.v8i16(<8 x i16>) declare i32 @llvm.wasm.alltrue.v8i16(<8 x i16>) ; CHECK-LABEL: any_v8i16_trunc: -; SIMD128-NEXT: .functype any_v8i16_trunc (v128) -> (i32){{$}} -; SIMD128-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype 
any_v8i16_trunc (v128) -> (i32){{$}} +; CHECK-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v8i16_trunc(<8 x i16> %x) { %a = call i32 @llvm.wasm.anytrue.v8i16(<8 x i16> %x) %b = trunc i32 %a to i1 @@ -96,9 +96,9 @@ define i32 @any_v8i16_trunc(<8 x i16> %x) { } ; CHECK-LABEL: any_v8i16_ne: -; SIMD128-NEXT: .functype any_v8i16_ne (v128) -> (i32){{$}} -; SIMD128-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v8i16_ne (v128) -> (i32){{$}} +; CHECK-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v8i16_ne(<8 x i16> %x) { %a = call i32 @llvm.wasm.anytrue.v8i16(<8 x i16> %x) %b = icmp ne i32 %a, 0 @@ -107,9 +107,9 @@ define i32 @any_v8i16_ne(<8 x i16> %x) { } ; CHECK-LABEL: any_v8i16_eq: -; SIMD128-NEXT: .functype any_v8i16_eq (v128) -> (i32){{$}} -; SIMD128-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v8i16_eq (v128) -> (i32){{$}} +; CHECK-NEXT: i16x8.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v8i16_eq(<8 x i16> %x) { %a = call i32 @llvm.wasm.anytrue.v8i16(<8 x i16> %x) %b = icmp eq i32 %a, 1 @@ -118,9 +118,9 @@ define i32 @any_v8i16_eq(<8 x i16> %x) { } ; CHECK-LABEL: all_v8i16_trunc: -; SIMD128-NEXT: .functype all_v8i16_trunc (v128) -> (i32){{$}} -; SIMD128-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v8i16_trunc (v128) -> (i32){{$}} +; CHECK-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v8i16_trunc(<8 x i16> %x) { %a = call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> %x) %b = trunc i32 %a to i1 @@ -129,9 +129,9 @@ define i32 @all_v8i16_trunc(<8 x i16> %x) { } ; CHECK-LABEL: all_v8i16_ne: -; SIMD128-NEXT: .functype all_v8i16_ne (v128) -> (i32){{$}} -; SIMD128-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v8i16_ne (v128) -> (i32){{$}} +; CHECK-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v8i16_ne(<8 x i16> %x) { %a = call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> %x) %b = icmp ne i32 %a, 0 @@ -140,9 +140,9 @@ define i32 @all_v8i16_ne(<8 x i16> %x) { } ; CHECK-LABEL: all_v8i16_eq: -; SIMD128-NEXT: .functype all_v8i16_eq (v128) -> (i32){{$}} -; SIMD128-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v8i16_eq (v128) -> (i32){{$}} +; CHECK-NEXT: i16x8.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v8i16_eq(<8 x i16> %x) { %a = call i32 @llvm.wasm.alltrue.v8i16(<8 x i16> %x) %b = icmp eq i32 %a, 1 @@ -157,9 +157,9 @@ declare i32 @llvm.wasm.anytrue.v4i32(<4 x i32>) declare i32 @llvm.wasm.alltrue.v4i32(<4 x i32>) ; CHECK-LABEL: any_v4i32_trunc: -; SIMD128-NEXT: .functype any_v4i32_trunc (v128) -> (i32){{$}} -; SIMD128-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v4i32_trunc (v128) -> (i32){{$}} +; CHECK-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v4i32_trunc(<4 x i32> %x) { %a = call i32 @llvm.wasm.anytrue.v4i32(<4 x i32> %x) %b = trunc i32 %a to i1 @@ -168,9 +168,9 @@ define i32 @any_v4i32_trunc(<4 x i32> %x) { } ; CHECK-LABEL: 
any_v4i32_ne: -; SIMD128-NEXT: .functype any_v4i32_ne (v128) -> (i32){{$}} -; SIMD128-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v4i32_ne (v128) -> (i32){{$}} +; CHECK-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v4i32_ne(<4 x i32> %x) { %a = call i32 @llvm.wasm.anytrue.v4i32(<4 x i32> %x) %b = icmp ne i32 %a, 0 @@ -179,9 +179,9 @@ define i32 @any_v4i32_ne(<4 x i32> %x) { } ; CHECK-LABEL: any_v4i32_eq: -; SIMD128-NEXT: .functype any_v4i32_eq (v128) -> (i32){{$}} -; SIMD128-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v4i32_eq (v128) -> (i32){{$}} +; CHECK-NEXT: i32x4.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v4i32_eq(<4 x i32> %x) { %a = call i32 @llvm.wasm.anytrue.v4i32(<4 x i32> %x) %b = icmp eq i32 %a, 1 @@ -190,9 +190,9 @@ define i32 @any_v4i32_eq(<4 x i32> %x) { } ; CHECK-LABEL: all_v4i32_trunc: -; SIMD128-NEXT: .functype all_v4i32_trunc (v128) -> (i32){{$}} -; SIMD128-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v4i32_trunc (v128) -> (i32){{$}} +; CHECK-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v4i32_trunc(<4 x i32> %x) { %a = call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> %x) %b = trunc i32 %a to i1 @@ -201,9 +201,9 @@ define i32 @all_v4i32_trunc(<4 x i32> %x) { } ; CHECK-LABEL: all_v4i32_ne: -; SIMD128-NEXT: .functype all_v4i32_ne (v128) -> (i32){{$}} -; SIMD128-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v4i32_ne (v128) -> (i32){{$}} +; CHECK-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v4i32_ne(<4 x i32> %x) { %a = call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> %x) %b = icmp ne i32 %a, 0 @@ -212,9 +212,9 @@ define i32 @all_v4i32_ne(<4 x i32> %x) { } ; CHECK-LABEL: all_v4i32_eq: -; SIMD128-NEXT: .functype all_v4i32_eq (v128) -> (i32){{$}} -; SIMD128-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v4i32_eq (v128) -> (i32){{$}} +; CHECK-NEXT: i32x4.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v4i32_eq(<4 x i32> %x) { %a = call i32 @llvm.wasm.alltrue.v4i32(<4 x i32> %x) %b = icmp eq i32 %a, 1 @@ -229,9 +229,9 @@ declare i32 @llvm.wasm.anytrue.v2i64(<2 x i64>) declare i32 @llvm.wasm.alltrue.v2i64(<2 x i64>) ; CHECK-LABEL: any_v2i64_trunc: -; SIMD128-NEXT: .functype any_v2i64_trunc (v128) -> (i32){{$}} -; SIMD128-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v2i64_trunc (v128) -> (i32){{$}} +; CHECK-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v2i64_trunc(<2 x i64> %x) { %a = call i32 @llvm.wasm.anytrue.v2i64(<2 x i64> %x) %b = trunc i32 %a to i1 @@ -240,9 +240,9 @@ define i32 @any_v2i64_trunc(<2 x i64> %x) { } ; CHECK-LABEL: any_v2i64_ne: -; SIMD128-NEXT: .functype any_v2i64_ne (v128) -> (i32){{$}} -; SIMD128-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v2i64_ne (v128) -> (i32){{$}} +; CHECK-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} 
define i32 @any_v2i64_ne(<2 x i64> %x) { %a = call i32 @llvm.wasm.anytrue.v2i64(<2 x i64> %x) %b = icmp ne i32 %a, 0 @@ -251,9 +251,9 @@ define i32 @any_v2i64_ne(<2 x i64> %x) { } ; CHECK-LABEL: any_v2i64_eq: -; SIMD128-NEXT: .functype any_v2i64_eq (v128) -> (i32){{$}} -; SIMD128-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype any_v2i64_eq (v128) -> (i32){{$}} +; CHECK-NEXT: i64x2.any_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @any_v2i64_eq(<2 x i64> %x) { %a = call i32 @llvm.wasm.anytrue.v2i64(<2 x i64> %x) %b = icmp eq i32 %a, 1 @@ -262,9 +262,9 @@ define i32 @any_v2i64_eq(<2 x i64> %x) { } ; CHECK-LABEL: all_v2i64_trunc: -; SIMD128-NEXT: .functype all_v2i64_trunc (v128) -> (i32){{$}} -; SIMD128-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v2i64_trunc (v128) -> (i32){{$}} +; CHECK-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v2i64_trunc(<2 x i64> %x) { %a = call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> %x) %b = trunc i32 %a to i1 @@ -273,9 +273,9 @@ define i32 @all_v2i64_trunc(<2 x i64> %x) { } ; CHECK-LABEL: all_v2i64_ne: -; SIMD128-NEXT: .functype all_v2i64_ne (v128) -> (i32){{$}} -; SIMD128-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v2i64_ne (v128) -> (i32){{$}} +; CHECK-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v2i64_ne(<2 x i64> %x) { %a = call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> %x) %b = icmp ne i32 %a, 0 @@ -284,9 +284,9 @@ define i32 @all_v2i64_ne(<2 x i64> %x) { } ; CHECK-LABEL: all_v2i64_eq: -; SIMD128-NEXT: .functype all_v2i64_eq (v128) -> (i32){{$}} -; SIMD128-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}} -; SIMD128-NEXT: return $pop[[R]]{{$}} +; CHECK-NEXT: .functype all_v2i64_eq (v128) -> (i32){{$}} +; CHECK-NEXT: i64x2.all_true $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-NEXT: return $pop[[R]]{{$}} define i32 @all_v2i64_eq(<2 x i64> %x) { %a = call i32 @llvm.wasm.alltrue.v2i64(<2 x i64> %x) %b = icmp eq i32 %a, 1 diff --git a/llvm/test/CodeGen/WebAssembly/simd-select.ll b/llvm/test/CodeGen/WebAssembly/simd-select.ll index be36f94cf5a6..19d11328154e 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-select.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-select.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -verify-machineinstrs -mattr=+unimplemented-simd128 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s ; Test that vector selects of various varieties lower correctly. 
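; Tests that still exercise both a SIMD and a non-SIMD configuration (such as
; the sext-inreg test below) keep their multi-prefix RUN lines: assertions
; shared by both configurations use CHECK, while configuration-specific output
; is matched under SIMD128 or NO-SIMD128. A sketch of that prefix split (the
; function name and instruction mix are illustrative):
;
;   RUN: llc < %s -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128
;   RUN: llc < %s | FileCheck %s --check-prefixes CHECK,NO-SIMD128
;
;   CHECK-LABEL: widen:
;   SIMD128: i32x4.shl
;   NO-SIMD128-NOT: i32x4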
diff --git a/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll b/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll index 13f9ca14812d..c2f00b349489 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-sext-inreg.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=+unimplemented-simd128 | FileCheck %s --check-prefixes CHECK,SIMD128 +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128 ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -wasm-keep-registers -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals | FileCheck %s --check-prefixes CHECK,NO-SIMD128 ; Test that vector sign extensions lower to shifts diff --git a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll index dbe426b07515..1fc0a92b9032 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-unsupported.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128 | FileCheck %s +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s ; Test that operations that are not supported by SIMD are properly ; unrolled. diff --git a/llvm/test/CodeGen/WebAssembly/simd.ll b/llvm/test/CodeGen/WebAssembly/simd.ll index c8053293ebac..b915b83cc132 100644 --- a/llvm/test/CodeGen/WebAssembly/simd.ll +++ b/llvm/test/CodeGen/WebAssembly/simd.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+unimplemented-simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128 -; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128-VM +; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128,+sign-ext | FileCheck %s --check-prefixes CHECK,SIMD128 ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefixes CHECK,NO-SIMD128 ; Test that basic SIMD128 vector manipulation operations assemble as expected. 
@@ -12,7 +11,6 @@ target triple = "wasm32-unknown-unknown" ; ============================================================================== ; CHECK-LABEL: const_v16i8: ; NO-SIMD128-NOT: i8x16 -; SIMD128-VM-NOT: v128.const ; SIMD128-NEXT: .functype const_v16i8 () -> (v128){{$}} ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, ; SIMD128-SAME: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 @@ -277,7 +275,6 @@ define <16 x i8> @build_v16i8(i8 %x0, i8 %x1, i8 %x2, i8 %x3, ; ============================================================================== ; CHECK-LABEL: const_v8i16: ; NO-SIMD128-NOT: i16x8 -; SIMD128-VM-NOT: v128.const ; SIMD128-NEXT: .functype const_v8i16 () -> (v128){{$}} ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 256, 770, 1284, 1798, 2312, 2826, 3340, 3854{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} @@ -526,7 +523,6 @@ define <8 x i16> @build_v8i16(i16 %x0, i16 %x1, i16 %x2, i16 %x3, ; ============================================================================== ; CHECK-LABEL: const_v4i32: ; NO-SIMD128-NOT: i32x4 -; SIMD128-VM-NOT: v128.const ; SIMD128-NEXT: .functype const_v4i32 () -> (v128){{$}} ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 50462976, 117835012, 185207048, 252579084{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} @@ -678,7 +674,6 @@ define <4 x i32> @build_v4i32(i32 %x0, i32 %x1, i32 %x2, i32 %x3) { ; ============================================================================== ; CHECK-LABEL: const_v2i64: ; NO-SIMD128-NOT: i64x2 -; SIMD128-VM-NOT: v128.const ; SIMD128-NEXT: .functype const_v2i64 () -> (v128){{$}} ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 506097522914230528, 1084818905618843912{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} @@ -824,7 +819,6 @@ define <2 x i64> @build_v2i64(i64 %x0, i64 %x1) { ; ============================================================================== ; CHECK-LABEL: const_v4f32: ; NO-SIMD128-NOT: f32x4 -; SIMD128-VM-NOT: v128.const ; SIMD128-NEXT: .functype const_v4f32 () -> (v128){{$}} ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, ; SIMD128-SAME: 0x1.0402p-121, 0x1.0c0a08p-113, 0x1.14121p-105, 0x1.1c1a18p-97{{$}} @@ -978,7 +972,6 @@ define <4 x float> @build_v4f32(float %x0, float %x1, float %x2, float %x3) { ; ============================================================================== ; CHECK-LABEL: const_v2f64: ; NO-SIMD128-NOT: f64x2 -; SIMD128-VM-NOT: v128.const ; SIMD128-NEXT: .functype const_v2f64 () -> (v128){{$}} ; SIMD128-NEXT: v128.const $push[[R:[0-9]+]]=, 0x1.60504030201p-911, 0x1.e0d0c0b0a0908p-783{{$}} ; SIMD128-NEXT: return $pop[[R]]{{$}} diff --git a/llvm/test/MC/WebAssembly/basic-assembly.s b/llvm/test/MC/WebAssembly/basic-assembly.s index 2faddd215433..a9ebe225f965 100644 --- a/llvm/test/MC/WebAssembly/basic-assembly.s +++ b/llvm/test/MC/WebAssembly/basic-assembly.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+reference-types,atomics,+unimplemented-simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s +# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+reference-types,atomics,+simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s # Check that it converts to .o without errors, but don't check any output: -# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+reference-types,+atomics,+unimplemented-simd128,+nontrapping-fptoint,+exception-handling -o %t.o < %s +# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+reference-types,+atomics,+simd128,+nontrapping-fptoint,+exception-handling -o %t.o < %s empty_func: 
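# The MC tests in this patch follow a common round-trip pattern: one RUN line
# checks the textual assembler output with FileCheck, and a second assembles
# to an object file, optionally inspected via obj2yaml. A minimal sketch with
# the renamed feature flag (the feature list is trimmed for illustration):
#
#   RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+simd128 < %s | FileCheck %s
#   RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+simd128 -o %t.o < %s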
diff --git a/llvm/test/MC/WebAssembly/data-section.s b/llvm/test/MC/WebAssembly/data-section.s index 71c62e5f0b16..f948f523d5a2 100644 --- a/llvm/test/MC/WebAssembly/data-section.s +++ b/llvm/test/MC/WebAssembly/data-section.s @@ -1,12 +1,12 @@ -# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+unimplemented-simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s +# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s # Check that it converts to .o without errors: -# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+unimplemented-simd128,+nontrapping-fptoint,+exception-handling < %s | obj2yaml | FileCheck -check-prefixes=BIN,BIN32 %s +# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s | obj2yaml | FileCheck -check-prefixes=BIN,BIN32 %s # Same again for wasm64 -# RUN: llvm-mc -triple=wasm64-unknown-unknown -mattr=+unimplemented-simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s +# RUN: llvm-mc -triple=wasm64-unknown-unknown -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s # Check that it converts to .o without errors: -# RUN: llvm-mc -triple=wasm64-unknown-unknown -filetype=obj -mattr=+unimplemented-simd128,+nontrapping-fptoint,+exception-handling < %s | obj2yaml | FileCheck -check-prefixes=BIN,BIN64 %s +# RUN: llvm-mc -triple=wasm64-unknown-unknown -filetype=obj -mattr=+simd128,+nontrapping-fptoint,+exception-handling < %s | obj2yaml | FileCheck -check-prefixes=BIN,BIN64 %s # Minimal test for data sections. diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index 099c6489e703..4ecf5e487665 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -1,4 +1,4 @@ -# RUN: llvm-mc -show-encoding -triple=wasm32-unknown-unknown -mattr=+unimplemented-simd128 < %s | FileCheck %s +# RUN: llvm-mc -show-encoding -triple=wasm32-unknown-unknown -mattr=+simd128 < %s | FileCheck %s main: .functype main () -> () diff --git a/llvm/test/MC/WebAssembly/type-index.s b/llvm/test/MC/WebAssembly/type-index.s index eef1a1012466..7c3ab80c5b83 100644 --- a/llvm/test/MC/WebAssembly/type-index.s +++ b/llvm/test/MC/WebAssembly/type-index.s @@ -1,6 +1,6 @@ -# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+reference-types,+unimplemented-simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s +# RUN: llvm-mc -triple=wasm32-unknown-unknown -mattr=+reference-types,+simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s # Check that it converts to .o without errors: -# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+reference-types,+unimplemented-simd128,+nontrapping-fptoint,+exception-handling < %s | obj2yaml | FileCheck -check-prefix=BIN %s +# RUN: llvm-mc -triple=wasm32-unknown-unknown -filetype=obj -mattr=+reference-types,+simd128,+nontrapping-fptoint,+exception-handling < %s | obj2yaml | FileCheck -check-prefix=BIN %s # Minimal test for type indices and table references in call_indirect. 
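# A quick way to confirm that no test still references the removed feature
# name (a suggested follow-up check, not part of the original patch):
#
#   git grep -n 'unimplemented-simd128' llvm/test clang/test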
diff --git a/llvm/test/MC/WebAssembly/types.ll b/llvm/test/MC/WebAssembly/types.ll index c049d3ce0e82..0c49f888fee4 100644 --- a/llvm/test/MC/WebAssembly/types.ll +++ b/llvm/test/MC/WebAssembly/types.ll @@ -1,4 +1,4 @@ -; RUN: llc -mattr=+unimplemented-simd128 -filetype=obj %s -o - | obj2yaml | FileCheck %s +; RUN: llc -mattr=+simd128 -filetype=obj %s -o - | obj2yaml | FileCheck %s target triple = "wasm32-unknown-unknown" diff --git a/llvm/test/MC/WebAssembly/wasm64.s b/llvm/test/MC/WebAssembly/wasm64.s index 793f91f11af9..5cb64403569e 100644 --- a/llvm/test/MC/WebAssembly/wasm64.s +++ b/llvm/test/MC/WebAssembly/wasm64.s @@ -1,5 +1,5 @@ -# RUN: llvm-mc -triple=wasm64-unknown-unknown -mattr=+atomics,+unimplemented-simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s -# RUN: llvm-mc -triple=wasm64-unknown-unknown -filetype=obj -mattr=+atomics,+unimplemented-simd128,+nontrapping-fptoint,+exception-handling -o - < %s | obj2yaml | FileCheck %s -check-prefix=BIN +# RUN: llvm-mc -triple=wasm64-unknown-unknown -mattr=+atomics,+simd128,+nontrapping-fptoint,+exception-handling < %s | FileCheck %s +# RUN: llvm-mc -triple=wasm64-unknown-unknown -filetype=obj -mattr=+atomics,+simd128,+nontrapping-fptoint,+exception-handling -o - < %s | obj2yaml | FileCheck %s -check-prefix=BIN # Most of our other tests are for wasm32, this one adds some wasm64 specific tests. -- GitLab From 6a9e7b117ba9b378429d5d5434c65d4872f99b35 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 2 Mar 2021 12:44:43 -0500 Subject: [PATCH 0063/1000] [libc++] Remove the Docker files for BuildBot We don't use them anymore since we're using the BuildKite setup. Differential Revision: https://reviews.llvm.org/D97779 --- libcxx/utils/docker/README.txt | 0 .../utils/docker/debian9/buildbot/Dockerfile | 39 ------ .../debian9/buildbot/buildbot-auth.json | 4 - .../debian9/buildbot/docker-compose.yml | 19 --- .../debian9/buildbot/install-gcloud-agents.sh | 11 -- .../debian9/buildbot/install-packages.sh | 40 ------- .../docker/debian9/buildbot/run_buildbot.sh | 111 ------------------ 7 files changed, 224 deletions(-) delete mode 100644 libcxx/utils/docker/README.txt delete mode 100644 libcxx/utils/docker/debian9/buildbot/Dockerfile delete mode 100644 libcxx/utils/docker/debian9/buildbot/buildbot-auth.json delete mode 100644 libcxx/utils/docker/debian9/buildbot/docker-compose.yml delete mode 100755 libcxx/utils/docker/debian9/buildbot/install-gcloud-agents.sh delete mode 100755 libcxx/utils/docker/debian9/buildbot/install-packages.sh delete mode 100755 libcxx/utils/docker/debian9/buildbot/run_buildbot.sh diff --git a/libcxx/utils/docker/README.txt b/libcxx/utils/docker/README.txt deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/libcxx/utils/docker/debian9/buildbot/Dockerfile b/libcxx/utils/docker/debian9/buildbot/Dockerfile deleted file mode 100644 index 7da50687b952..000000000000 --- a/libcxx/utils/docker/debian9/buildbot/Dockerfile +++ /dev/null @@ -1,39 +0,0 @@ - -#===-------------------------------------------------------------------------------------------===// -# buildslave -#===-------------------------------------------------------------------------------------------===// -ARG gcc_tot -ARG llvm_tot - -FROM ${gcc_tot} AS gcc-tot -FROM ${llvm_tot} AS llvm-tot - -FROM debian:stretch AS base-image - -ADD install-packages.sh /tmp/ -RUN /tmp/install-packages.sh && rm /tmp/install-packages.sh - -COPY --from=ericwf/gcc:5.5.0 /compiler /opt/gcc-5 - -FROM base-image as worker-image - -COPY --from=gcc-tot 
/compiler /opt/gcc-tot -COPY --from=llvm-tot /compiler /opt/llvm-tot - -ENV PATH /opt/llvm-tot/bin:$PATH - -RUN clang++ --version && echo hello -RUN g++ --version - - -RUN /opt/gcc-tot/bin/g++ --version -RUN /opt/llvm-tot/bin/clang++ --version -RUN /opt/llvm-tot/bin/clang --version - -# FIXME(EricWF): remove this once the buildbot's config doesn't clobber the path. -RUN ln -s /opt/llvm-tot/bin/clang /usr/local/bin/clang -RUN ln -s /opt/llvm-tot/bin/clang++ /usr/local/bin/clang++ - - -ADD run_buildbot.sh / -CMD /run_buildbot.sh /run/secrets/buildbot-auth diff --git a/libcxx/utils/docker/debian9/buildbot/buildbot-auth.json b/libcxx/utils/docker/debian9/buildbot/buildbot-auth.json deleted file mode 100644 index 5e91e2d4158f..000000000000 --- a/libcxx/utils/docker/debian9/buildbot/buildbot-auth.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "login": "", - "password": "" -} diff --git a/libcxx/utils/docker/debian9/buildbot/docker-compose.yml b/libcxx/utils/docker/debian9/buildbot/docker-compose.yml deleted file mode 100644 index bd61dea4871c..000000000000 --- a/libcxx/utils/docker/debian9/buildbot/docker-compose.yml +++ /dev/null @@ -1,19 +0,0 @@ -version: '3.7' -services: - llvm-buildbot-worker: - build: - context: https://github.com/llvm/llvm-project.git#master:libcxx/utils/docker/debian9/buildbot - args: - gcc_tot: "ericwf/gcc:9.2.0" - llvm_tot: "ericwf/llvm:11.x" - image: llvm-buildbot-worker - volumes: - - /var/run/docker.sock:/var/run/docker.sock - secrets: - - buildbot-auth - logging: - driver: gcplogs - -secrets: - buildbot-auth: - file: buildbot-auth.json diff --git a/libcxx/utils/docker/debian9/buildbot/install-gcloud-agents.sh b/libcxx/utils/docker/debian9/buildbot/install-gcloud-agents.sh deleted file mode 100755 index d2656ca5092a..000000000000 --- a/libcxx/utils/docker/debian9/buildbot/install-gcloud-agents.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/env bash - -cd /tmp/ - -curl -sSO https://dl.google.com/cloudagents/install-monitoring-agent.sh -sudo bash install-monitoring-agent.sh -rm install-monitoring-agent.sh - -curl -sSO https://dl.google.com/cloudagents/install-logging-agent.sh -sudo bash install-logging-agent.sh -rm install-logging-agent.sh diff --git a/libcxx/utils/docker/debian9/buildbot/install-packages.sh b/libcxx/utils/docker/debian9/buildbot/install-packages.sh deleted file mode 100755 index 56e7c00d4930..000000000000 --- a/libcxx/utils/docker/debian9/buildbot/install-packages.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -set -x -set -e - -apt-get update && \ - apt-get install -y --no-install-recommends \ - buildbot-slave \ - ca-certificates \ - gnupg \ - build-essential \ - wget \ - unzip \ - python \ - ninja-build \ - curl \ - git \ - gcc-multilib \ - g++-multilib \ - libc6-dev \ - libtool \ - locales-all \ - binutils-dev \ - binutils-gold \ - software-properties-common \ - gnupg \ - apt-transport-https \ - sudo \ - bash-completion \ - vim \ - jq \ - systemd \ - sysvinit-utils \ - systemd-sysv && \ - rm -rf /var/lib/apt/lists/* - -# Install a recent CMake -yes | apt-get purge cmake -wget https://github.com/Kitware/CMake/releases/download/v3.18.2/cmake-3.18.2-Linux-x86_64.sh -O /tmp/install-cmake.sh -bash /tmp/install-cmake.sh --prefix=/usr --exclude-subdir --skip-license diff --git a/libcxx/utils/docker/debian9/buildbot/run_buildbot.sh b/libcxx/utils/docker/debian9/buildbot/run_buildbot.sh deleted file mode 100755 index e008a30558c9..000000000000 --- a/libcxx/utils/docker/debian9/buildbot/run_buildbot.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env bash 
-set -x
-
-readonly BOT_ROOT=/b
-readonly AUTH_FILE=$1
-readonly BOT_ROOT_NAME=$(jq -r ".login" $AUTH_FILE)
-
-systemctl daemon-reload
-service buildslave stop
-mkdir -p /b
-rm -rf /b/*
-service buildslave stop
-
-pushd /tmp/
-
-curl -sSO https://dl.google.com/cloudagents/install-monitoring-agent.sh
-sudo bash install-monitoring-agent.sh
-rm install-monitoring-agent.sh
-
-curl -sSO https://dl.google.com/cloudagents/install-logging-agent.sh
-sudo bash install-logging-agent.sh
-rm install-logging-agent.sh
-
-popd
-
-
-systemctl set-property buildslave.service TasksMax=100000
-
-function setup_numbered_bot() {
-  local BOT_NAME=$1
-  local BOT_DIR=$2
-  mkdir -p $BOT_DIR
-
-  buildslave stop $BOT_DIR
-  chown buildbot $BOT_DIR
-  rm -rf $BOT_DIR/*
-
-  buildslave create-slave --allow-shutdown=signal "$BOT_DIR" "lab.llvm.org:9990" "$BOT_NAME" $(jq -r ".password" $AUTH_FILE)
-
-  echo "Eric Fiselier " > $BOT_DIR/info/admin
-
-  echo "Connecting as $1"
-  {
-    uname -a | head -n1
-    cmake --version | head -n1
-    g++ --version | head -n1
-    clang++ --version | head -n1
-    ld --version | head -n1
-    date
-    lscpu
-  } > $BOT_DIR/info/host
-
-
-#echo "SLAVE_RUNNER=/usr/bin/buildslave
-#SLAVE_ENABLED[1]=\"1\"
-#SLAVE_NAME[1]=\"$BOT_NAME\"
-#SLAVE_USER[1]=\"buildbot\"
-#SLAVE_BASEDIR[1]=\"$BOT_DIR\"
-#SLAVE_OPTIONS[1]=\"\"
-#SLAVE_PREFIXCMD[1]=\"\"" > $BOT_DIR/buildslave.cfg
-
-  ls $BOT_DIR/
-  cat $BOT_DIR/buildbot.tac
-}
-
-function try_start_builder {
-  local N=$1
-  local BOT_DIR="$BOT_ROOT/b$N"
-  local BOT_NAME="$BOT_ROOT_NAME$N"
-
-  systemctl daemon-reload
-  service buildslave restart
-  setup_numbered_bot "$BOT_NAME" "$BOT_DIR"
-
-  systemctl daemon-reload
-  service buildslave restart
-
-  chown -R buildbot $BOT_DIR/
-  sudo -u buildbot /usr/bin/buildslave start $BOT_DIR/
-
-  sleep 30
-  cat $BOT_DIR/twistd.log
-  if grep --quiet "slave is ready" $BOT_DIR/twistd.log; then
-    return 0
-  fi
-  if grep --quiet "configuration update complete" $BOT_DIR/twistd.log; then
-    return 0
-  fi
-  if grep "rejecting duplicate slave" $BOT_DIR/twistd.log; then
-    return 1
-  fi
-  echo "Unknown error"
-  cat $BOT_DIR/twistd.log
-  exit 1
-}
-
-for N in `shuf -i 1-5`
-do
-  if try_start_builder $N; then
-    break
-  fi
-  echo "failed to start any buildbot"
-  shutdown now
-done
-
-# GCE can restart instance after 24h in the middle of the build.
-# Gracefully restart before that happen.
-sleep 72000
-while pkill -SIGHUP buildslave; do sleep 5; done;
-shutdown now
-
--
GitLab


From 64bb3759dda5191bd51c4b8114eca12a24b9a5f2 Mon Sep 17 00:00:00 2001
From: Kristof Beyls
Date: Wed, 17 Mar 2021 17:55:56 +0100
Subject: [PATCH 0064/1000] [docs] Document regular LLVM sync-ups

This documents, in the Getting Involved section, the regular LLVM
sync-ups that are currently happening.

I hope this gives a bit more visibility to the regular sync-ups
happening in the LLVM community, and documents another way
communication in the community happens.

Of course, the downside is that this is another location where sync-up
metadata needs to be maintained. That being said, the proposed
structure means that no changes are needed once a new sync-up is
added, apart from perhaps removing the entry once it becomes clear
that a particular sync-up series has been cancelled.

Documenting a few pointers on how current sync-ups happen may also
encourage others to organize useful sync-ups on specific topics.

I've started by adding the sync-ups I'm aware of; there's a good
chance I've missed some.

If most sync-ups end up having a public Google calendar, we could also
create and maintain a public Google calendar that shows all events
happening in the LLVM community, including dev meetings, sync-ups,
socials, etc. - assuming that would be valuable.

Differential Revision: https://reviews.llvm.org/D98797
---
 llvm/docs/GettingInvolved.rst | 46 +++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst
index 3b6e14e5840e..c6856cc77eb1 100644
--- a/llvm/docs/GettingInvolved.rst
+++ b/llvm/docs/GettingInvolved.rst
@@ -135,6 +135,52 @@ lists.
 
 .. __: http://lists.llvm.org/mailman/listinfo/llvm-announce
 
+Online Sync-Ups
+---------------
+
+A number of regular calls are organized on specific topics. It should be
+expected that the range of topics will change over time. At the time of
+writing, the following sync-ups are organized:
+
+.. list-table:: LLVM regular sync-up calls
+   :widths: 25 25 25 25
+   :header-rows: 1
+
+   * - Topic
+     - Frequency
+     - Calendar link
+     - Minutes/docs link
+   * - RISC-V
+     - Every 2 weeks on Thursday
+     - `ics `__
+       `gcal `__
+     -
+   * - Scalable Vectors and Arm SVE
+     - Monthly, every 3rd Tuesday
+     -
+     - `Minutes/docs `__
+   * - ML Guided Compiler Optimizations
+     - Monthly
+     -
+     - `Minutes/docs `__
+   * - `LLVM security group `__
+     - Monthly, every 3rd Tuesday
+     - `ics `__
+       `gcal `__
+     - `Minutes/docs `__
+   * - `CIRCT `__
+     - Weekly, on Wednesday
+     -
+     - `Minutes/docs `__
+   * - `MLIR `__ design meetings
+     - Weekly, on Thursdays
+     -
+     - `Minutes/docs `__
+   * - flang and openmp
+     - Multiple meeting series, `documented here `__
+     -
+     -
+
 IRC
 ---
--
GitLab


From eb37d3546cd0c6e67798496634c45e501f7806f1 Mon Sep 17 00:00:00 2001
From: Arthur O'Dwyer
Date: Thu, 18 Mar 2021 10:28:56 -0400
Subject: [PATCH 0065/1000] [libc++] Future-proof
 generate_feature_test_macro_components.py against long names.

`__cpp_lib_default_template_type_for_algorithm_values` is 52 characters
long, which is enough to reduce the multiplier to less-than-zero,
producing an empty string between the name of the macro and its numeric
value. Ensure there's always a space between the name of the macro and
its value.

Differential Revision: https://reviews.llvm.org/D98869
---
 libcxx/utils/generate_feature_test_macro_components.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index ce0007610b08..e69c7c1f9442 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -720,7 +720,7 @@ def get_std_number(std):
 
 def produce_macros_definition_for_std(std):
   result = ""
-  indent = 56
+  indent = 55
   for tc in feature_test_macros:
     if std not in tc["values"]:
       continue
@@ -734,7 +734,7 @@ def produce_macros_definition_for_std(std):
       result += "# undef %s\n" % tc["name"]
     line = "#%sdefine %s" % ((" " * inner_indent), tc["name"])
     line += " " * (indent - len(line))
-    line += "%sL" % tc["values"][std]
+    line += " %sL" % tc["values"][std]
     if 'unimplemented' in tc.keys():
       line = "// " + line
     result += line
--
GitLab


From 6359049c35042adb34ffe6ba77008613c1436ee1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20B=C3=B6ck?=
Date: Thu, 18 Mar 2021 18:51:10 +0100
Subject: [PATCH 0066/1000] [CMake][runtimes] Add file level dependency to
 merge_archives commands

Both libc++ and libc++abi have options of merging with another archive.
In the case of libc++abi, libunwind can be merged into it, and in the
case of libc++, libc++abi can be merged into it. This is realized using
add_custom_command with POST_BUILD and the use of the CMake generator
expression TARGET_LINKER_FILE in the arguments. For such generator
expressions, the CMake documentation states:

"This target-level dependency does NOT add a file-level dependency that
would cause the custom command to re-run whenever the executable is
recompiled" [1]

This patch adds a DEPENDS argument to both add_custom_command
invocations so that the archives also have a file-level dependency on
the target they are merging with. That way, changes in, say, the
libunwind source code will be reflected in the libc++abi and/or libc++
static libraries as well.

[1] https://cmake.org/cmake/help/v3.20/command/add_custom_command.html

Differential Revision: https://reviews.llvm.org/D98129
---
 libcxx/src/CMakeLists.txt    | 4 ++++
 libcxxabi/src/CMakeLists.txt | 1 +
 2 files changed, 5 insertions(+)

diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt
index 5a59b58d4363..2afc69be37b8 100644
--- a/libcxx/src/CMakeLists.txt
+++ b/libcxx/src/CMakeLists.txt
@@ -299,6 +299,9 @@ if (LIBCXX_ENABLE_STATIC)
   else()
     set(MERGE_ARCHIVES_ABI_TARGET
       "${CMAKE_STATIC_LIBRARY_PREFIX}${LIBCXX_CXX_STATIC_ABI_LIBRARY}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+    if (LIBCXX_CXX_ABI_LIBRARY_PATH)
+      set(MERGE_ARCHIVES_ABI_TARGET "${LIBCXX_CXX_ABI_LIBRARY_PATH}/${MERGE_ARCHIVES_ABI_TARGET}")
+    endif ()
   endif()
   if (APPLE)
     set(MERGE_ARCHIVES_LIBTOOL "--use-libtool" "--libtool" "${CMAKE_LIBTOOL}")
@@ -314,6 +317,7 @@ if (LIBCXX_ENABLE_STATIC)
       "${MERGE_ARCHIVES_ABI_TARGET}"
       "${MERGE_ARCHIVES_SEARCH_PATHS}"
       WORKING_DIRECTORY ${LIBCXX_BUILD_DIR}
+      DEPENDS ${MERGE_ARCHIVES_ABI_TARGET}
     )
   endif()
 endif()
diff --git a/libcxxabi/src/CMakeLists.txt b/libcxxabi/src/CMakeLists.txt
index 50afdf6890a3..ea8c54589006 100644
--- a/libcxxabi/src/CMakeLists.txt
+++ b/libcxxabi/src/CMakeLists.txt
@@ -302,6 +302,7 @@ if (LIBCXXABI_ENABLE_STATIC)
       "$"
       "$"
       WORKING_DIRECTORY ${LIBCXXABI_BUILD_DIR}
+      DEPENDS unwind_static
     )
   endif()
 endif()
--
GitLab


From 858ca7c174761248ff888a8435059317a7fe1116 Mon Sep 17 00:00:00 2001
From: Jorg Brown
Date: Thu, 18 Mar 2021 11:00:07 -0700
Subject: [PATCH 0067/1000] Fix typo: `char` should be `TS`

---
 compiler-rt/include/fuzzer/FuzzedDataProvider.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/compiler-rt/include/fuzzer/FuzzedDataProvider.h
index 6cbfc39bc20b..71cb427ec4a9 100644
--- a/compiler-rt/include/fuzzer/FuzzedDataProvider.h
+++ b/compiler-rt/include/fuzzer/FuzzedDataProvider.h
@@ -390,7 +390,7 @@ TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) {
     return static_cast<TS>(value);
   } else {
     constexpr auto TS_min = std::numeric_limits<TS>::min();
-    return TS_min + static_cast<char>(value - TS_min);
+    return TS_min + static_cast<TS>(value - TS_min);
   }
 }
--
GitLab


From 4c782a24d901b6317599c98f59161e6e0b5cc244 Mon Sep 17 00:00:00 2001
From: lorenzo chelini
Date: Thu, 18 Mar 2021 19:15:33 +0100
Subject: [PATCH 0068/1000] [mlir] Fix typo in SCF.cpp (NFC)

---
 mlir/lib/Dialect/SCF/SCF.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/SCF/SCF.cpp b/mlir/lib/Dialect/SCF/SCF.cpp
index 8def7a0c6e7e..fdb9df82900c 100644
--- a/mlir/lib/Dialect/SCF/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/SCF.cpp
@@ -254,7 +254,7 @@ ForOp mlir::scf::getForInductionVarOwner(Value val) {
 }
 
 /// Return operands used when entering the region at 'index'.
These operands -/// correspond to the loop iterator operands, i.e., those exclusing the +/// correspond to the loop iterator operands, i.e., those excluding the /// induction variable. LoopOp only has one region, so 0 is the only valid value /// for `index`. OperandRange ForOp::getSuccessorEntryOperands(unsigned index) { -- GitLab From 0d8331c06be6981da5341bdcbbc6dd867002da08 Mon Sep 17 00:00:00 2001 From: peter klausler Date: Thu, 18 Mar 2021 10:26:23 -0700 Subject: [PATCH 0069/1000] [flang] Refine symbol sorting Replace semantics::SymbolSet with alternatives that clarify whether the set should order its contents by source position or not. This matters because positionally-ordered sets must not be used for Symbols that might be subjected to name replacement during name resolution, and address-ordered sets must not be used (without sorting) in circumstances where the order of their contents affects the output of the compiler. All set<> and map<> instances in the compiler that are keyed by Symbols now have explicit Compare types in their template instantiations. Symbol::operator< is no more. Differential Revision: https://reviews.llvm.org/D98878 --- flang/include/flang/Evaluate/constant.h | 7 +++- flang/include/flang/Evaluate/tools.h | 10 +++-- flang/include/flang/Semantics/semantics.h | 5 ++- flang/include/flang/Semantics/symbol.h | 46 +++++++++++++++------ flang/lib/Evaluate/characteristics.cpp | 25 ++++++----- flang/lib/Evaluate/constant.cpp | 4 ++ flang/lib/Evaluate/tools.cpp | 31 +++++++++----- flang/lib/Parser/provenance.cpp | 11 +++-- flang/lib/Semantics/check-declarations.cpp | 3 +- flang/lib/Semantics/check-do-forall.cpp | 38 +++++++++-------- flang/lib/Semantics/check-omp-structure.cpp | 2 +- flang/lib/Semantics/compute-offsets.cpp | 5 ++- flang/lib/Semantics/mod-file.cpp | 7 ++-- flang/lib/Semantics/resolve-directives.cpp | 6 +-- flang/lib/Semantics/resolve-names.cpp | 4 +- flang/lib/Semantics/scope.cpp | 2 +- flang/test/Semantics/resolve102.f90 | 3 -- 17 files changed, 124 insertions(+), 85 deletions(-) diff --git a/flang/include/flang/Evaluate/constant.h b/flang/include/flang/Evaluate/constant.h index 89a5867722f7..d5f7db8be45d 100644 --- a/flang/include/flang/Evaluate/constant.h +++ b/flang/include/flang/Evaluate/constant.h @@ -195,8 +195,11 @@ private: }; class StructureConstructor; -using StructureConstructorValues = - std::map>>; +struct ComponentCompare { + bool operator()(SymbolRef x, SymbolRef y) const; +}; +using StructureConstructorValues = std::map>, ComponentCompare>; template <> class Constant diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index afa70fd0099a..4a0a4dcf4041 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -839,10 +839,12 @@ template SymbolVector GetSymbolVector(const A &x) { const Symbol *GetLastTarget(const SymbolVector &); // Collects all of the Symbols in an expression -template semantics::SymbolSet CollectSymbols(const A &); -extern template semantics::SymbolSet CollectSymbols(const Expr &); -extern template semantics::SymbolSet CollectSymbols(const Expr &); -extern template semantics::SymbolSet CollectSymbols( +template semantics::UnorderedSymbolSet CollectSymbols(const A &); +extern template semantics::UnorderedSymbolSet CollectSymbols( + const Expr &); +extern template semantics::UnorderedSymbolSet CollectSymbols( + const Expr &); +extern template semantics::UnorderedSymbolSet CollectSymbols( const Expr &); // Predicate: does a variable contain a 
vector-valued subscript (not a triplet)? diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h index e6202c666429..3ef0cafa872a 100644 --- a/flang/include/flang/Semantics/semantics.h +++ b/flang/include/flang/Semantics/semantics.h @@ -198,8 +198,9 @@ private: parser::CharBlock location; IndexVarKind kind; }; - std::map activeIndexVars_; - SymbolSet errorSymbols_; + std::map + activeIndexVars_; + UnorderedSymbolSet errorSymbols_; std::set tempNames_; }; diff --git a/flang/include/flang/Semantics/symbol.h b/flang/include/flang/Semantics/symbol.h index 957bffdb4833..0078d2567473 100644 --- a/flang/include/flang/Semantics/symbol.h +++ b/flang/include/flang/Semantics/symbol.h @@ -596,13 +596,6 @@ public: bool operator==(const Symbol &that) const { return this == &that; } bool operator!=(const Symbol &that) const { return !(*this == that); } - // Symbol comparison is based on the order of cooked source - // stream creation and, when both are from the same cooked source, - // their positions in that cooked source stream. - // (This function is implemented in Evaluate/tools.cpp to - // satisfy complicated shared library interdependency.) - bool operator<(const Symbol &) const; - int Rank() const { return std::visit( common::visitors{ @@ -767,13 +760,40 @@ inline const DeclTypeSpec *Symbol::GetType() const { details_); } -inline bool operator<(SymbolRef x, SymbolRef y) { - return *x < *y; // name source position ordering -} -inline bool operator<(MutableSymbolRef x, MutableSymbolRef y) { - return *x < *y; // name source position ordering +// Sets and maps keyed by Symbols + +struct SymbolAddressCompare { + bool operator()(const SymbolRef &x, const SymbolRef &y) const { + return &*x < &*y; + } + bool operator()(const MutableSymbolRef &x, const MutableSymbolRef &y) const { + return &*x < &*y; + } +}; + +// Symbol comparison is based on the order of cooked source +// stream creation and, when both are from the same cooked source, +// their positions in that cooked source stream. +// Don't use this comparator or OrderedSymbolSet to hold +// Symbols that might be subject to ReplaceName(). +struct SymbolSourcePositionCompare { + // These functions are implemented in Evaluate/tools.cpp to + // satisfy complicated shared library interdependency. 
+ bool operator()(const SymbolRef &, const SymbolRef &) const; + bool operator()(const MutableSymbolRef &, const MutableSymbolRef &) const; +}; + +using UnorderedSymbolSet = std::set; +using OrderedSymbolSet = std::set; + +template +OrderedSymbolSet OrderBySourcePosition(const A &container) { + OrderedSymbolSet result; + for (SymbolRef x : container) { + result.emplace(x); + } + return result; } -using SymbolSet = std::set; } // namespace Fortran::semantics diff --git a/flang/lib/Evaluate/characteristics.cpp b/flang/lib/Evaluate/characteristics.cpp index 4d5436a9776c..c6d6afeb81d1 100644 --- a/flang/lib/Evaluate/characteristics.cpp +++ b/flang/lib/Evaluate/characteristics.cpp @@ -343,30 +343,29 @@ bool DummyProcedure::operator==(const DummyProcedure &that) const { procedure.value() == that.procedure.value(); } -static std::string GetSeenProcs(const semantics::SymbolSet &seenProcs) { +static std::string GetSeenProcs( + const semantics::UnorderedSymbolSet &seenProcs) { // Sort the symbols so that they appear in the same order on all platforms - std::vector sorter{seenProcs.begin(), seenProcs.end()}; - std::sort(sorter.begin(), sorter.end()); - + auto ordered{semantics::OrderBySourcePosition(seenProcs)}; std::string result; llvm::interleave( - sorter, + ordered, [&](const SymbolRef p) { result += '\'' + p->name().ToString() + '\''; }, [&]() { result += ", "; }); return result; } -// These functions with arguments of type SymbolSet are used with mutually -// recursive calls when characterizing a Procedure, a DummyArgument, or a -// DummyProcedure to detect circularly defined procedures as required by +// These functions with arguments of type UnorderedSymbolSet are used with +// mutually recursive calls when characterizing a Procedure, a DummyArgument, +// or a DummyProcedure to detect circularly defined procedures as required by // 15.4.3.6, paragraph 2. static std::optional CharacterizeDummyArgument( const semantics::Symbol &symbol, FoldingContext &context, - semantics::SymbolSet &seenProcs); + semantics::UnorderedSymbolSet &seenProcs); static std::optional CharacterizeProcedure( const semantics::Symbol &original, FoldingContext &context, - semantics::SymbolSet &seenProcs) { + semantics::UnorderedSymbolSet &seenProcs) { Procedure result; const auto &symbol{original.GetUltimate()}; if (seenProcs.find(symbol) != seenProcs.end()) { @@ -475,7 +474,7 @@ static std::optional CharacterizeProcedure( static std::optional CharacterizeDummyProcedure( const semantics::Symbol &symbol, FoldingContext &context, - semantics::SymbolSet &seenProcs) { + semantics::UnorderedSymbolSet &seenProcs) { if (auto procedure{CharacterizeProcedure(symbol, context, seenProcs)}) { // Dummy procedures may not be elemental. 
Elemental dummy procedure // interfaces are errors when the interface is not intrinsic, and that @@ -516,7 +515,7 @@ bool DummyArgument::operator==(const DummyArgument &that) const { static std::optional CharacterizeDummyArgument( const semantics::Symbol &symbol, FoldingContext &context, - semantics::SymbolSet &seenProcs) { + semantics::UnorderedSymbolSet &seenProcs) { auto name{symbol.name().ToString()}; if (symbol.has()) { if (auto obj{DummyDataObject::Characterize(symbol, context)}) { @@ -779,7 +778,7 @@ bool Procedure::CanOverride( std::optional Procedure::Characterize( const semantics::Symbol &original, FoldingContext &context) { - semantics::SymbolSet seenProcs; + semantics::UnorderedSymbolSet seenProcs; return CharacterizeProcedure(original, context, seenProcs); } diff --git a/flang/lib/Evaluate/constant.cpp b/flang/lib/Evaluate/constant.cpp index 5b73979f1e2c..8f30ca081162 100644 --- a/flang/lib/Evaluate/constant.cpp +++ b/flang/lib/Evaluate/constant.cpp @@ -315,5 +315,9 @@ std::size_t Constant::CopyFrom(const Constant &source, return Base::CopyFrom(source, count, resultSubscripts, dimOrder); } +bool ComponentCompare::operator()(SymbolRef x, SymbolRef y) const { + return semantics::SymbolSourcePositionCompare{}(x, y); +} + INSTANTIATE_CONSTANT_TEMPLATES } // namespace Fortran::evaluate diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index 638b7941c9e8..9fbf21e43b72 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -782,20 +782,22 @@ const Symbol *GetLastTarget(const SymbolVector &symbols) { } struct CollectSymbolsHelper - : public SetTraverse { - using Base = SetTraverse; + : public SetTraverse { + using Base = SetTraverse; CollectSymbolsHelper() : Base{*this} {} using Base::operator(); - semantics::SymbolSet operator()(const Symbol &symbol) const { + semantics::UnorderedSymbolSet operator()(const Symbol &symbol) const { return {symbol}; } }; -template semantics::SymbolSet CollectSymbols(const A &x) { +template semantics::UnorderedSymbolSet CollectSymbols(const A &x) { return CollectSymbolsHelper{}(x); } -template semantics::SymbolSet CollectSymbols(const Expr &); -template semantics::SymbolSet CollectSymbols(const Expr &); -template semantics::SymbolSet CollectSymbols(const Expr &); +template semantics::UnorderedSymbolSet CollectSymbols(const Expr &); +template semantics::UnorderedSymbolSet CollectSymbols( + const Expr &); +template semantics::UnorderedSymbolSet CollectSymbols( + const Expr &); // HasVectorSubscript() struct HasVectorSubscriptHelper : public AnyTraverse { @@ -1177,7 +1179,7 @@ const Symbol &GetUsedModule(const UseDetails &details) { } static const Symbol *FindFunctionResult( - const Symbol &original, SymbolSet &seen) { + const Symbol &original, UnorderedSymbolSet &seen) { const Symbol &root{GetAssociationRoot(original)}; ; if (!seen.insert(root).second) { @@ -1199,7 +1201,7 @@ static const Symbol *FindFunctionResult( } const Symbol *FindFunctionResult(const Symbol &symbol) { - SymbolSet seen; + UnorderedSymbolSet seen; return FindFunctionResult(symbol, seen); } @@ -1207,8 +1209,15 @@ const Symbol *FindFunctionResult(const Symbol &symbol) { // them; they cannot be defined in symbol.h due to the dependence // on Scope. 
-bool Symbol::operator<(const Symbol &that) const {
-  return GetSemanticsContext().allCookedSources().Precedes(name_, that.name_);
+bool SymbolSourcePositionCompare::operator()(
+    const SymbolRef &x, const SymbolRef &y) const {
+  return x->GetSemanticsContext().allCookedSources().Precedes(
+      x->name(), y->name());
+}
+bool SymbolSourcePositionCompare::operator()(
+    const MutableSymbolRef &x, const MutableSymbolRef &y) const {
+  return x->GetSemanticsContext().allCookedSources().Precedes(
+      x->name(), y->name());
 }
 
 SemanticsContext &Symbol::GetSemanticsContext() const {
diff --git a/flang/lib/Parser/provenance.cpp b/flang/lib/Parser/provenance.cpp
index 79cb28615b95..2aa1a97ce557 100644
--- a/flang/lib/Parser/provenance.cpp
+++ b/flang/lib/Parser/provenance.cpp
@@ -602,16 +602,15 @@ void AllCookedSources::Dump(llvm::raw_ostream &o) const {
 }
 
 bool AllCookedSources::Precedes(CharBlock x, CharBlock y) const {
-  const CookedSource *ySource{Find(y)};
   if (const CookedSource * xSource{Find(x)}) {
-    if (ySource) {
-      int xNum{xSource->number()};
-      int yNum{ySource->number()};
-      return xNum < yNum || (xNum == yNum && x.begin() < y.begin());
+    if (xSource->AsCharBlock().Contains(y)) {
+      return x.begin() < y.begin();
+    } else if (const CookedSource * ySource{Find(y)}) {
+      return xSource->number() < ySource->number();
     } else {
       return true; // by fiat, all cooked source < anything outside
     }
-  } else if (ySource) {
+  } else if (Find(y)) {
     return false;
   } else {
     // Both names are compiler-created (SaveTempName).
diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp
index ebc0bcf606b5..0dad3c6e8d9b 100644
--- a/flang/lib/Semantics/check-declarations.cpp
+++ b/flang/lib/Semantics/check-declarations.cpp
@@ -110,7 +110,8 @@ private:
   // that has a symbol.
   const Symbol *innermostSymbol_{nullptr};
   // Cache of calls to Procedure::Characterize(Symbol)
-  std::map<SymbolRef, std::optional<Procedure>> characterizeCache_;
+  std::map<SymbolRef, std::optional<Procedure>, SymbolAddressCompare>
+      characterizeCache_;
 };
 
 class DistinguishabilityHelper {
diff --git a/flang/lib/Semantics/check-do-forall.cpp b/flang/lib/Semantics/check-do-forall.cpp
index d2f55eed539c..1532dea61ac5 100644
--- a/flang/lib/Semantics/check-do-forall.cpp
+++ b/flang/lib/Semantics/check-do-forall.cpp
@@ -548,9 +548,9 @@ private:
   // the names up in the scope that encloses the DO construct to avoid getting
   // the local versions of them.  Then follow the host-, use-, and
   // construct-associations to get the root symbols
-  SymbolSet GatherLocals(
+  UnorderedSymbolSet GatherLocals(
       const std::list<parser::LocalitySpec> &localitySpecs) const {
-    SymbolSet symbols;
+    UnorderedSymbolSet symbols;
     const Scope &parentScope{
         context_.FindScope(currentStatementSourcePosition_).parent()};
     // Loop through the LocalitySpec::Local locality-specs
@@ -568,8 +568,9 @@ private:
     return symbols;
   }
 
-  static SymbolSet GatherSymbolsFromExpression(const parser::Expr &expression) {
-    SymbolSet result;
+  static UnorderedSymbolSet GatherSymbolsFromExpression(
+      const parser::Expr &expression) {
+    UnorderedSymbolSet result;
     if (const auto *expr{GetExpr(expression)}) {
       for (const Symbol &symbol : evaluate::CollectSymbols(*expr)) {
         result.insert(ResolveAssociations(symbol));
@@ -580,8 +581,9 @@ private:
 
   // C1121 - procedures in mask must be pure
   void CheckMaskIsPure(const parser::ScalarLogicalExpr &mask) const {
-    SymbolSet references{GatherSymbolsFromExpression(mask.thing.thing.value())};
-    for (const Symbol &ref : references) {
+    UnorderedSymbolSet references{
+        GatherSymbolsFromExpression(mask.thing.thing.value())};
+    for (const Symbol &ref : OrderBySourcePosition(references)) {
       if (IsProcedure(ref) && !IsPureProcedure(ref)) {
         context_.SayWithDecl(ref, parser::Unwrap<parser::Expr>(mask)->source,
             "%s mask expression may not reference impure procedure '%s'"_err_en_US,
@@ -591,10 +593,10 @@ private:
     }
   }
 
-  void CheckNoCollisions(const SymbolSet &refs, const SymbolSet &uses,
-      parser::MessageFixedText &&errorMessage,
+  void CheckNoCollisions(const UnorderedSymbolSet &refs,
+      const UnorderedSymbolSet &uses, parser::MessageFixedText &&errorMessage,
       const parser::CharBlock &refPosition) const {
-    for (const Symbol &ref : refs) {
+    for (const Symbol &ref : OrderBySourcePosition(refs)) {
       if (uses.find(ref) != uses.end()) {
         context_.SayWithDecl(ref, refPosition, std::move(errorMessage),
             LoopKindName(), ref.name());
@@ -603,8 +605,8 @@ private:
     }
   }
 
-  void HasNoReferences(
-      const SymbolSet &indexNames, const parser::ScalarIntExpr &expr) const {
+  void HasNoReferences(const UnorderedSymbolSet &indexNames,
+      const parser::ScalarIntExpr &expr) const {
     CheckNoCollisions(GatherSymbolsFromExpression(expr.thing.thing.value()),
         indexNames,
         "%s limit expression may not reference index variable '%s'"_err_en_US,
@@ -612,8 +614,8 @@ private:
   }
 
   // C1129, names in local locality-specs can't be in mask expressions
-  void CheckMaskDoesNotReferenceLocal(
-      const parser::ScalarLogicalExpr &mask, const SymbolSet &localVars) const {
+  void CheckMaskDoesNotReferenceLocal(const parser::ScalarLogicalExpr &mask,
+      const UnorderedSymbolSet &localVars) const {
     CheckNoCollisions(GatherSymbolsFromExpression(mask.thing.thing.value()),
         localVars,
         "%s mask expression references variable '%s'"
@@ -623,8 +625,8 @@ private:
 
   // C1129, names in local locality-specs can't be in limit or step
   // expressions
-  void CheckExprDoesNotReferenceLocal(
-      const parser::ScalarIntExpr &expr, const SymbolSet &localVars) const {
+  void CheckExprDoesNotReferenceLocal(const parser::ScalarIntExpr &expr,
+      const UnorderedSymbolSet &localVars) const {
     CheckNoCollisions(GatherSymbolsFromExpression(expr.thing.thing.value()),
         localVars,
         "%s expression references variable '%s'"
@@ -663,7 +665,7 @@ private:
       CheckMaskIsPure(*mask);
     }
     auto &controls{std::get<std::list<parser::ConcurrentControl>>(header.t)};
-    SymbolSet indexNames;
+    UnorderedSymbolSet indexNames;
     for (const parser::ConcurrentControl &control : controls) {
      const auto &indexName{std::get<parser::Name>(control.t)};
       if (indexName.symbol) {
@@ -697,7 +699,7 @@ private:
       const auto &localitySpecs{
           std::get<std::list<parser::LocalitySpec>>(concurrent.t)};
       if (!localitySpecs.empty()) {
-        const SymbolSet &localVars{GatherLocals(localitySpecs)};
+        const UnorderedSymbolSet &localVars{GatherLocals(localitySpecs)};
         for (const auto &c : GetControls(control)) {
           CheckExprDoesNotReferenceLocal(std::get<1>(c.t), localVars);
           CheckExprDoesNotReferenceLocal(std::get<2>(c.t), localVars);
@@ -733,7 +735,7 @@ private:
   void CheckForallIndexesUsed(const evaluate::Assignment &assignment) {
     SymbolVector indexVars{context_.GetIndexVars(IndexVarKind::FORALL)};
     if (!indexVars.empty()) {
-      SymbolSet symbols{evaluate::CollectSymbols(assignment.lhs)};
+      UnorderedSymbolSet symbols{evaluate::CollectSymbols(assignment.lhs)};
       std::visit(
           common::visitors{
               [&](const evaluate::Assignment::BoundsSpec &spec) {
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 269e64919a6a..a3a3fd5d3524 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -630,7 +630,7 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) {
     }
   }
   // A list-item cannot appear in more than one aligned clause
-  semantics::SymbolSet alignedVars;
+  semantics::UnorderedSymbolSet alignedVars;
   auto clauseAll = FindClauses(llvm::omp::Clause::OMPC_aligned);
   for (auto itr = clauseAll.first; itr != clauseAll.second; ++itr) {
     const auto &alignedClause{
diff --git a/flang/lib/Semantics/compute-offsets.cpp b/flang/lib/Semantics/compute-offsets.cpp
index bb2f4d98a17d..4b1538ca785f 100644
--- a/flang/lib/Semantics/compute-offsets.cpp
+++ b/flang/lib/Semantics/compute-offsets.cpp
@@ -58,9 +58,10 @@ private:
   std::size_t offset_{0};
   std::size_t alignment_{1};
   // symbol -> symbol+offset that determines its location, from EQUIVALENCE
-  std::map<MutableSymbolRef, SymbolAndOffset> dependents_;
+  std::map<MutableSymbolRef, SymbolAndOffset, SymbolAddressCompare> dependents_;
   // base symbol -> SizeAndAlignment for each distinct EQUIVALENCE block
-  std::map<MutableSymbolRef, SizeAndAlignment> equivalenceBlock_;
+  std::map<MutableSymbolRef, SizeAndAlignment, SymbolAddressCompare>
+      equivalenceBlock_;
 };
 
 void ComputeOffsetsHelper::Compute(Scope &scope) {
diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp
index c3b95a7836a8..1e2a5c6728b7 100644
--- a/flang/lib/Semantics/mod-file.cpp
+++ b/flang/lib/Semantics/mod-file.cpp
@@ -81,8 +81,8 @@ private:
   const Scope &scope_;
   bool isInterface_{false};
   SymbolVector need_; // symbols that are needed
-  SymbolSet needSet_; // symbols already in need_
-  SymbolSet useSet_; // use-associations that might be needed
+  UnorderedSymbolSet needSet_; // symbols already in need_
+  UnorderedSymbolSet useSet_; // use-associations that might be needed
   std::set<SourceName> imports_; // imports from host that are needed
 
   void DoSymbol(const Symbol &);
@@ -498,7 +498,8 @@ void CollectSymbols(
   for (const auto &pair : scope.commonBlocks()) {
     sorted.push_back(*pair.second);
   }
-  std::sort(sorted.end() - commonSize, sorted.end());
+  std::sort(
+      sorted.end() - commonSize, sorted.end(), SymbolSourcePositionCompare{});
 }
 
 void PutEntity(llvm::raw_ostream &os, const Symbol &symbol) {
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 8f12278f8559..d5ba6a12995d 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -105,7 +105,7 @@ protected:
   Symbol *DeclarePrivateAccessEntity(Symbol &, Symbol::Flag, Scope &);
   Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag);
 
-  SymbolSet dataSharingAttributeObjects_; // on one directive
+  UnorderedSymbolSet dataSharingAttributeObjects_; // on one directive
   SemanticsContext &context_;
   std::vector<DirContext> dirContext_; // used as a stack
 };
@@ -452,8 +452,8 @@ private:
       Symbol::Flag::OmpCopyIn, Symbol::Flag::OmpCopyPrivate};
 
   std::vector<const parser::Name *> allocateNames_; // on one directive
-  SymbolSet privateDataSharingAttributeObjects_; // on one directive
-  SymbolSet stmtFunctionExprSymbols_;
+  UnorderedSymbolSet privateDataSharingAttributeObjects_; // on one directive
+  UnorderedSymbolSet stmtFunctionExprSymbols_;
   std::multimap>> sourceLabels_;
diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp
index 813debbe1d86..398f45c5c4ae 100644
--- a/flang/lib/Semantics/resolve-names.cpp
+++ b/flang/lib/Semantics/resolve-names.cpp
@@ -2690,7 +2690,7 @@ void InterfaceVisitor::AddSpecificProcs(
 // this generic interface. Resolve those names to symbols.
 void InterfaceVisitor::ResolveSpecificsInGeneric(Symbol &generic) {
   auto &details{generic.get<GenericDetails>()};
-  SymbolSet symbolsSeen;
+  UnorderedSymbolSet symbolsSeen;
   for (const Symbol &symbol : details.specificProcs()) {
     symbolsSeen.insert(symbol);
   }
@@ -3651,7 +3651,7 @@ Symbol &DeclarationVisitor::DeclareUnknownEntity(
 
 bool DeclarationVisitor::HasCycle(
     const Symbol &procSymbol, const ProcInterface &interface) {
-  SymbolSet procsInCycle;
+  OrderedSymbolSet procsInCycle;
   procsInCycle.insert(procSymbol);
   const ProcInterface *thisInterface{&interface};
   bool haveInterface{true};
diff --git a/flang/lib/Semantics/scope.cpp b/flang/lib/Semantics/scope.cpp
index 2e2b8f77f16e..4faec3bd00cd 100644
--- a/flang/lib/Semantics/scope.cpp
+++ b/flang/lib/Semantics/scope.cpp
@@ -61,7 +61,7 @@ static std::vector<common::Reference<Symbol>> GetSortedSymbols(
   for (auto &pair : symbols) {
     result.push_back(*pair.second);
   }
-  std::sort(result.begin(), result.end());
+  std::sort(result.begin(), result.end(), SymbolSourcePositionCompare{});
   return result;
 }
 
diff --git a/flang/test/Semantics/resolve102.f90 b/flang/test/Semantics/resolve102.f90
index 4f900a1309f3..c5b3f53bbdc4 100644
--- a/flang/test/Semantics/resolve102.f90
+++ b/flang/test/Semantics/resolve102.f90
@@ -68,7 +68,6 @@ program twoCycle
   !ERROR: The interface for procedure 'p1' is recursively defined
   !ERROR: The interface for procedure 'p2' is recursively defined
   procedure(p1) p2
-  !ERROR: 'p2' must be an abstract interface or a procedure with an explicit interface
   procedure(p2) p1
   call p1
   call p2
@@ -76,10 +75,8 @@ end program
 
 program threeCycle
   !ERROR: The interface for procedure 'p1' is recursively defined
-  !ERROR: 'p1' must be an abstract interface or a procedure with an explicit interface
   !ERROR: The interface for procedure 'p2' is recursively defined
   procedure(p1) p2
-  !ERROR: 'p2' must be an abstract interface or a procedure with an explicit interface
   !ERROR: The interface for procedure 'p3' is recursively defined
   procedure(p2) p3
   procedure(p3) p1
-- 
GitLab


From 2f2ae08da91dc5c188d5bb4d8b0b096d0a120a4a Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Thu, 18 Mar 2021 11:21:24 -0700
Subject: [PATCH 0070/1000] [WebAssembly] Remove experimental SIMD instructions

Removes the instruction definitions, intrinsics, and builtins for qfma/qfms,
signselect, and prefetch instructions, which were not included in the final
WebAssembly SIMD spec.

Depends on D98457.
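For code that still calls the removed builtins, the sketch below shows one
possible migration path. It assumes the semantics described in the
pre-standard proposals (qfma(a, b, c) computing a + b * c lane-wise, and
signselect(v1, v2, c) taking lanes of v1 where c is negative); the helper
names are hypothetical, and generic vector code like this should lower to
plain SIMD arithmetic and v128.bitselect. The prefetch instructions were
pure hints, so calls to __builtin_wasm_prefetch_t/_nt can simply be deleted.

    /* Hypothetical migration helpers; compile with clang --target=wasm32 -msimd128. */
    typedef float f32x4 __attribute__((__vector_size__(16)));
    typedef signed char i8x16 __attribute__((__vector_size__(16)));

    /* Stands in for the removed __builtin_wasm_qfma_f32x4(a, b, c). */
    static f32x4 qfma_substitute(f32x4 a, f32x4 b, f32x4 c) {
      return a + b * c; /* f32x4.mul followed by f32x4.add */
    }

    /* Stands in for the removed __builtin_wasm_signselect_i8x16(v1, v2, c). */
    static i8x16 signselect_substitute(i8x16 v1, i8x16 v2, i8x16 c) {
      const i8x16 zero = {0};
      i8x16 m = c < zero;          /* all-ones lanes where c is negative */
      return (v1 & m) | (v2 & ~m); /* the equivalent of v128.bitselect */
    }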
Differential Revision: https://reviews.llvm.org/D98466 --- .../clang/Basic/BuiltinsWebAssembly.def | 16 -- clang/lib/CodeGen/CGBuiltin.cpp | 64 ----- clang/test/CodeGen/builtins-wasm.c | 78 ------ llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 35 --- .../MCTargetDesc/WebAssemblyMCTargetDesc.h | 2 - .../WebAssembly/WebAssemblyISelLowering.cpp | 10 - .../WebAssembly/WebAssemblyInstrSIMD.td | 97 -------- .../CodeGen/WebAssembly/simd-intrinsics.ll | 117 --------- .../WebAssembly/simd-prefetch-offset.ll | 235 ------------------ llvm/test/MC/WebAssembly/simd-encodings.s | 36 --- 10 files changed, 690 deletions(-) delete mode 100644 llvm/test/CodeGen/WebAssembly/simd-prefetch-offset.ll diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index 38de66587cba..2f51376ba15a 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -141,11 +141,6 @@ TARGET_BUILTIN(__builtin_wasm_extadd_pairwise_i16x8_u_i32x4, "V4UiV8Us", "nc", " TARGET_BUILTIN(__builtin_wasm_bitselect, "V4iV4iV4iV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_signselect_i8x16, "V16ScV16ScV16ScV16Sc", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_signselect_i16x8, "V8sV8sV8sV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_signselect_i32x4, "V4iV4iV4iV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_signselect_i64x2, "V2LLiV2LLiV2LLiV2LLi", "nc", "simd128") - TARGET_BUILTIN(__builtin_wasm_shuffle_v8x16, "V16ScV16ScV16ScIiIiIiIiIiIiIiIiIiIiIiIiIiIiIiIi", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_any_true_i8x16, "iV16Sc", "nc", "simd128") @@ -188,11 +183,6 @@ TARGET_BUILTIN(__builtin_wasm_dot_s_i32x4_i16x8, "V4iV8sV8s", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_sqrt_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_sqrt_f64x2, "V2dV2d", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_qfma_f32x4, "V4fV4fV4fV4f", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_qfms_f32x4, "V4fV4fV4fV4f", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_qfma_f64x2, "V2dV2dV2dV2d", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_qfms_f64x2, "V2dV2dV2dV2d", "nc", "simd128") - TARGET_BUILTIN(__builtin_wasm_trunc_saturate_s_i32x4_f32x4, "V4iV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_trunc_saturate_u_i32x4_f32x4, "V4iV4f", "nc", "simd128") @@ -206,9 +196,6 @@ TARGET_BUILTIN(__builtin_wasm_widen_high_s_i32x4_i64x2, "V2LLiV4i", "nc", "simd1 TARGET_BUILTIN(__builtin_wasm_widen_low_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_widen_high_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_s_i8x16_i32x4, "V4iV16ScIi", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_u_i8x16_i32x4, "V4UiV16UcIi", "nc", "simd128") - TARGET_BUILTIN(__builtin_wasm_convert_low_s_i32x4_f64x2, "V2dV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_convert_low_u_i32x4_f64x2, "V2dV4Ui", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_trunc_saturate_zero_s_f64x2_i32x4, "V4iV2d", "nc", "simd128") @@ -230,8 +217,5 @@ TARGET_BUILTIN(__builtin_wasm_store64_lane, "vLLi*V2LLiIi", "n", "simd128") TARGET_BUILTIN(__builtin_wasm_eq_i64x2, "V2LLiV2LLiV2LLi", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_prefetch_t, "vv*", "n", "simd128") -TARGET_BUILTIN(__builtin_wasm_prefetch_nt, "vv*", "n", "simd128") - #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 8d1d3c50870c..96df7b0d6222 100644 --- 
a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17366,17 +17366,6 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(Intrinsic::wasm_bitselect, ConvertType(E->getType())); return Builder.CreateCall(Callee, {V1, V2, C}); } - case WebAssembly::BI__builtin_wasm_signselect_i8x16: - case WebAssembly::BI__builtin_wasm_signselect_i16x8: - case WebAssembly::BI__builtin_wasm_signselect_i32x4: - case WebAssembly::BI__builtin_wasm_signselect_i64x2: { - Value *V1 = EmitScalarExpr(E->getArg(0)); - Value *V2 = EmitScalarExpr(E->getArg(1)); - Value *C = EmitScalarExpr(E->getArg(2)); - Function *Callee = - CGM.getIntrinsic(Intrinsic::wasm_signselect, ConvertType(E->getType())); - return Builder.CreateCall(Callee, {V1, V2, C}); - } case WebAssembly::BI__builtin_wasm_dot_s_i32x4_i16x8: { Value *LHS = EmitScalarExpr(E->getArg(0)); Value *RHS = EmitScalarExpr(E->getArg(1)); @@ -17444,29 +17433,6 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(Intrinsic::sqrt, Vec->getType()); return Builder.CreateCall(Callee, {Vec}); } - case WebAssembly::BI__builtin_wasm_qfma_f32x4: - case WebAssembly::BI__builtin_wasm_qfms_f32x4: - case WebAssembly::BI__builtin_wasm_qfma_f64x2: - case WebAssembly::BI__builtin_wasm_qfms_f64x2: { - Value *A = EmitScalarExpr(E->getArg(0)); - Value *B = EmitScalarExpr(E->getArg(1)); - Value *C = EmitScalarExpr(E->getArg(2)); - unsigned IntNo; - switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_qfma_f32x4: - case WebAssembly::BI__builtin_wasm_qfma_f64x2: - IntNo = Intrinsic::wasm_qfma; - break; - case WebAssembly::BI__builtin_wasm_qfms_f32x4: - case WebAssembly::BI__builtin_wasm_qfms_f64x2: - IntNo = Intrinsic::wasm_qfms; - break; - default: - llvm_unreachable("unexpected builtin ID"); - } - Function *Callee = CGM.getIntrinsic(IntNo, A->getType()); - return Builder.CreateCall(Callee, {A, B, C}); - } case WebAssembly::BI__builtin_wasm_narrow_s_i8x16_i16x8: case WebAssembly::BI__builtin_wasm_narrow_u_i8x16_i16x8: case WebAssembly::BI__builtin_wasm_narrow_s_i16x8_i32x4: @@ -17515,26 +17481,6 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(IntNo); return Builder.CreateCall(Callee, Vec); } - case WebAssembly::BI__builtin_wasm_widen_s_i8x16_i32x4: - case WebAssembly::BI__builtin_wasm_widen_u_i8x16_i32x4: { - Value *Vec = EmitScalarExpr(E->getArg(0)); - llvm::APSInt SubVecConst = - *E->getArg(1)->getIntegerConstantExpr(getContext()); - Value *SubVec = llvm::ConstantInt::get(getLLVMContext(), SubVecConst); - unsigned IntNo; - switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_widen_s_i8x16_i32x4: - IntNo = Intrinsic::wasm_widen_signed; - break; - case WebAssembly::BI__builtin_wasm_widen_u_i8x16_i32x4: - IntNo = Intrinsic::wasm_widen_unsigned; - break; - default: - llvm_unreachable("unexpected builtin ID"); - } - Function *Callee = CGM.getIntrinsic(IntNo); - return Builder.CreateCall(Callee, {Vec, SubVec}); - } case WebAssembly::BI__builtin_wasm_convert_low_s_i32x4_f64x2: case WebAssembly::BI__builtin_wasm_convert_low_u_i32x4_f64x2: { Value *Vec = EmitScalarExpr(E->getArg(0)); @@ -17649,16 +17595,6 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_shuffle); return Builder.CreateCall(Callee, Ops); } - case WebAssembly::BI__builtin_wasm_prefetch_t: { - Value *Ptr = EmitScalarExpr(E->getArg(0)); - Function *Callee = 
CGM.getIntrinsic(Intrinsic::wasm_prefetch_t); - return Builder.CreateCall(Callee, Ptr); - } - case WebAssembly::BI__builtin_wasm_prefetch_nt: { - Value *Ptr = EmitScalarExpr(E->getArg(0)); - Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_prefetch_nt); - return Builder.CreateCall(Callee, Ptr); - } default: return nullptr; } diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 124b09633693..71816ceda469 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -644,34 +644,6 @@ i32x4 bitselect(i32x4 x, i32x4 y, i32x4 c) { // WEBASSEMBLY-NEXT: ret } -i8x16 signselect_i8x16(i8x16 x, i8x16 y, i8x16 c) { - return __builtin_wasm_signselect_i8x16(x, y, c); - // WEBASSEMBLY: call <16 x i8> @llvm.wasm.signselect.v16i8( - // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y, <16 x i8> %c) - // WEBASSEMBLY-NEXT: ret -} - -i16x8 signselect_i16x8(i16x8 x, i16x8 y, i16x8 c) { - return __builtin_wasm_signselect_i16x8(x, y, c); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.signselect.v8i16( - // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y, <8 x i16> %c) - // WEBASSEMBLY-NEXT: ret -} - -i32x4 signselect_i32x4(i32x4 x, i32x4 y, i32x4 c) { - return __builtin_wasm_signselect_i32x4(x, y, c); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.signselect.v4i32( - // WEBASSEMBLY-SAME: <4 x i32> %x, <4 x i32> %y, <4 x i32> %c) - // WEBASSEMBLY-NEXT: ret -} - -i64x2 signselect_i64x2(i64x2 x, i64x2 y, i64x2 c) { - return __builtin_wasm_signselect_i64x2(x, y, c); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.signselect.v2i64( - // WEBASSEMBLY-SAME: <2 x i64> %x, <2 x i64> %y, <2 x i64> %c) - // WEBASSEMBLY-NEXT: ret -} - i8x16 popcnt(i8x16 x) { return __builtin_wasm_popcnt_i8x16(x); // WEBASSEMBLY: call <16 x i8> @llvm.wasm.popcnt(<16 x i8> %x) @@ -884,34 +856,6 @@ f64x2 sqrt_f64x2(f64x2 x) { // WEBASSEMBLY: ret } -f32x4 qfma_f32x4(f32x4 a, f32x4 b, f32x4 c) { - return __builtin_wasm_qfma_f32x4(a, b, c); - // WEBASSEMBLY: call <4 x float> @llvm.wasm.qfma.v4f32( - // WEBASSEMBLY-SAME: <4 x float> %a, <4 x float> %b, <4 x float> %c) - // WEBASSEMBLY-NEXT: ret -} - -f32x4 qfms_f32x4(f32x4 a, f32x4 b, f32x4 c) { - return __builtin_wasm_qfms_f32x4(a, b, c); - // WEBASSEMBLY: call <4 x float> @llvm.wasm.qfms.v4f32( - // WEBASSEMBLY-SAME: <4 x float> %a, <4 x float> %b, <4 x float> %c) - // WEBASSEMBLY-NEXT: ret -} - -f64x2 qfma_f64x2(f64x2 a, f64x2 b, f64x2 c) { - return __builtin_wasm_qfma_f64x2(a, b, c); - // WEBASSEMBLY: call <2 x double> @llvm.wasm.qfma.v2f64( - // WEBASSEMBLY-SAME: <2 x double> %a, <2 x double> %b, <2 x double> %c) - // WEBASSEMBLY-NEXT: ret -} - -f64x2 qfms_f64x2(f64x2 a, f64x2 b, f64x2 c) { - return __builtin_wasm_qfms_f64x2(a, b, c); - // WEBASSEMBLY: call <2 x double> @llvm.wasm.qfms.v2f64( - // WEBASSEMBLY-SAME: <2 x double> %a, <2 x double> %b, <2 x double> %c) - // WEBASSEMBLY-NEXT: ret -} - i32x4 trunc_saturate_s_i32x4_f32x4(f32x4 f) { return __builtin_wasm_trunc_saturate_s_i32x4_f32x4(f); // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.saturate.signed.v4i32.v4f32(<4 x float> %f) @@ -976,18 +920,6 @@ u64x2 widen_high_u_i32x4_i64x2(u32x4 x) { // WEBASSEMBLY: ret } -i32x4 widen_s_i8x16_i32x4(i8x16 x) { - return __builtin_wasm_widen_s_i8x16_i32x4(x, 3); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.signed(<16 x i8> %x, i32 3) - // WEBASSEMBLY: ret -} - -u32x4 widen_u_i8x16_i32x4(u8x16 x) { - return __builtin_wasm_widen_u_i8x16_i32x4(x, 3); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.widen.unsigned(<16 x i8> %x, i32 3) - // 
WEBASSEMBLY: ret -} - f64x2 convert_low_s_i32x4_f64x2(i32x4 x) { return __builtin_wasm_convert_low_s_i32x4_f64x2(x); // WEBASSEMBLY: call <2 x double> @llvm.wasm.convert.low.signed(<4 x i32> %x) @@ -1050,13 +982,3 @@ i8x16 shuffle(i8x16 x, i8x16 y) { // WEBASSEMBLY-SAME: i32 15 // WEBASSEMBLY-NEXT: ret } - -void prefetch_t(void *p) { - return __builtin_wasm_prefetch_t(p); - // WEBASSEMBLY: call void @llvm.wasm.prefetch.t(i8* %p) -} - -void prefetch_nt(void *p) { - return __builtin_wasm_prefetch_nt(p); - // WEBASSEMBLY: call void @llvm.wasm.prefetch.nt(i8* %p) -} diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index 323b9a770c05..cd916e78f9f4 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -143,14 +143,6 @@ def int_wasm_bitmask : Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_qfma : - Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; -def int_wasm_qfms : - Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; def int_wasm_dot : Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], @@ -302,11 +294,6 @@ def int_wasm_extadd_pairwise_unsigned : [LLVMSubdivide2VectorType<0>], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_signselect : - Intrinsic<[llvm_anyvector_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem, IntrSpeculatable]>; - // TODO: Remove this intrinsic and the associated builtin if i64x2.eq gets // merged to the proposal. def int_wasm_eq : @@ -314,20 +301,6 @@ def int_wasm_eq : [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem, IntrSpeculatable]>; -// TODO: Remove this after experiments have been run. Use the target-agnostic -// int_prefetch if this becomes specified at some point. -def int_wasm_prefetch_t : - Intrinsic<[], [llvm_ptr_ty], - [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, - ReadOnly>, NoCapture>], - "", [SDNPMemOperand]>; - -def int_wasm_prefetch_nt : - Intrinsic<[], [llvm_ptr_ty], - [IntrInaccessibleMemOrArgMemOnly, IntrWillReturn, - ReadOnly>, NoCapture>], - "", [SDNPMemOperand]>; - // TODO: Remove these if possible if they are merged to the spec. def int_wasm_convert_low_signed : Intrinsic<[llvm_v2f64_ty], [llvm_v4i32_ty], @@ -348,14 +321,6 @@ def int_wasm_promote_low : Intrinsic<[llvm_v2f64_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]>; -// TODO: Remove these if possible if they are merged to the spec. 
-def int_wasm_widen_signed : - Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg>]>; -def int_wasm_widen_unsigned : - Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty, llvm_i32_ty], - [IntrNoMem, IntrSpeculatable, ImmArg>]>; - //===----------------------------------------------------------------------===// // Thread-local storage intrinsics //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h index 5b77b8495adf..3508ec0ba98f 100644 --- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h +++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h @@ -196,8 +196,6 @@ inline unsigned GetDefaultP2AlignAny(unsigned Opc) { WASM_LOAD_STORE(LOAD8_SPLAT) WASM_LOAD_STORE(LOAD_LANE_I8x16) WASM_LOAD_STORE(STORE_LANE_I8x16) - WASM_LOAD_STORE(PREFETCH_T) - WASM_LOAD_STORE(PREFETCH_NT) return 0; WASM_LOAD_STORE(LOAD16_S_I32) WASM_LOAD_STORE(LOAD16_U_I32) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 8cf44b545e06..f28fe67b0b46 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -761,16 +761,6 @@ bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = MemAlign; return true; } - case Intrinsic::wasm_prefetch_t: - case Intrinsic::wasm_prefetch_nt: { - Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = MVT::i8; - Info.ptrVal = I.getArgOperand(0); - Info.offset = 0; - Info.align = Align(1); - Info.flags = MachineMemOperand::MOLoad; - return true; - } default: return false; } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index d1f8cf4f5c15..83f29acf6348 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -791,22 +791,6 @@ def : Pat<(select (SELECT_V128 $rhs, $lhs, $cond)>; } // foreach vec -// Sign select -multiclass SIMDSignSelect simdop> { - defm SIGNSELECT_#vec : - SIMD_I<(outs V128:$dst), (ins V128:$v1, V128:$v2, V128:$c), (outs), (ins), - [(set (vec.vt V128:$dst), - (vec.vt (int_wasm_signselect - (vec.vt V128:$v1), (vec.vt V128:$v2), (vec.vt V128:$c))))], - vec.prefix#".signselect\t$dst, $v1, $v2, $c", - vec.prefix#".signselect", simdop>; -} - -defm : SIMDSignSelect; -defm : SIMDSignSelect; -defm : SIMDSignSelect; -defm : SIMDSignSelect; - //===----------------------------------------------------------------------===// // Integer unary arithmetic //===----------------------------------------------------------------------===// @@ -1270,90 +1254,9 @@ defm "" : SIMDConvert; -// Prototype i8x16 to i32x4 widening -defm WIDEN_I8x16_TO_I32x4_S : - SIMD_I<(outs V128:$dst), (ins V128:$vec, vec_i8imm_op:$idx), - (outs), (ins vec_i8imm_op:$idx), - [(set (I32x4.vt V128:$dst), - (I32x4.vt (int_wasm_widen_signed - (I8x16.vt V128:$vec), (i32 timm:$idx))))], - "i32x4.widen_i8x16_s\t$dst, $vec, $idx", - "i32x4.widen_i8x16_s\t$idx", 0x67>; -defm WIDEN_I8x16_TO_I32x4_U : - SIMD_I<(outs V128:$dst), (ins V128:$vec, vec_i8imm_op:$idx), - (outs), (ins vec_i8imm_op:$idx), - [(set (I32x4.vt V128:$dst), - (I32x4.vt (int_wasm_widen_unsigned - (I8x16.vt V128:$vec), (i32 timm:$idx))))], - "i32x4.widen_i8x16_u\t$dst, $vec, $idx", - "i32x4.widen_i8x16_u\t$idx", 
0x68>; - - -//===----------------------------------------------------------------------===// -// Quasi-Fused Multiply- Add and Subtract (QFMA/QFMS) -//===----------------------------------------------------------------------===// - -multiclass SIMDQFM simdopA, bits<32> simdopS> { - defm QFMA_#vec : - SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), - (outs), (ins), - [(set (vec.vt V128:$dst), (int_wasm_qfma - (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], - vec.prefix#".qfma\t$dst, $a, $b, $c", vec.prefix#".qfma", simdopA>; - defm QFMS_#vec : - SIMD_I<(outs V128:$dst), (ins V128:$a, V128:$b, V128:$c), - (outs), (ins), - [(set (vec.vt V128:$dst), (int_wasm_qfms - (vec.vt V128:$a), (vec.vt V128:$b), (vec.vt V128:$c)))], - vec.prefix#".qfms\t$dst, $a, $b, $c", vec.prefix#".qfms", simdopS>; -} - -defm "" : SIMDQFM; -defm "" : SIMDQFM; - //===----------------------------------------------------------------------===// // Saturating Rounding Q-Format Multiplication //===----------------------------------------------------------------------===// defm Q15MULR_SAT_S : SIMDBinary; - -//===----------------------------------------------------------------------===// -// Experimental prefetch instructions: prefetch.t, prefetch.nt -//===----------------------------------------------------------------------===// - -let mayLoad = true, UseNamedOperandTable = true in { -defm PREFETCH_T_A32 : - SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), - (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "prefetch.t\t${off}(${addr})$p2align", - "prefetch.t\t$off$p2align", 0xc5>; -defm PREFETCH_T_A64 : - SIMD_I<(outs), (ins P2Align:$p2align, offset64_op:$off, I64:$addr), - (outs), (ins P2Align:$p2align, offset64_op:$off), [], - "prefetch.t\t${off}(${addr})$p2align", - "prefetch.t\t$off$p2align", 0xc5>; -defm PREFETCH_NT_A32 : - SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr), - (outs), (ins P2Align:$p2align, offset32_op:$off), [], - "prefetch.nt\t${off}(${addr})$p2align", - "prefetch.nt\t$off$p2align", 0xc6>; -defm PREFETCH_NT_A64 : - SIMD_I<(outs), (ins P2Align:$p2align, offset64_op:$off, I64:$addr), - (outs), (ins P2Align:$p2align, offset64_op:$off), [], - "prefetch.nt\t${off}(${addr})$p2align", - "prefetch.nt\t$off$p2align", 0xc6>; -} // mayLoad, UseNamedOperandTable - -multiclass PrefetchPatNoOffset { - def : Pat<(kind I32:$addr), (!cast(inst # "_A32") 0, 0, $addr)>, - Requires<[HasAddr32]>; - def : Pat<(kind I64:$addr), (!cast(inst # "_A64") 0, 0, $addr)>, - Requires<[HasAddr64]>; -} - -foreach inst = [["PREFETCH_T", "int_wasm_prefetch_t"], - ["PREFETCH_NT", "int_wasm_prefetch_nt"]] in { -defvar node = !cast(inst[1]); -defm : PrefetchPatNoOffset; -} diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll index a3b0d50903f6..606b8b6753d1 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -127,18 +127,6 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %c) { ret <16 x i8> %a } -; CHECK-LABEL: signselect_v16i8: -; CHECK-NEXT: .functype signselect_v16i8 (v128, v128, v128) -> (v128){{$}} -; CHECK-NEXT: i8x16.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <16 x i8> @llvm.wasm.signselect.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) -define <16 x i8> @signselect_v16i8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %c) { - %a = call <16 x i8> @llvm.wasm.signselect.v16i8( - <16 x 
i8> %v1, <16 x i8> %v2, <16 x i8> %c - ) - ret <16 x i8> %a -} - ; CHECK-LABEL: narrow_signed_v16i8: ; CHECK-NEXT: .functype narrow_signed_v16i8 (v128, v128) -> (v128){{$}} ; CHECK-NEXT: i8x16.narrow_i16x8_s $push[[R:[0-9]+]]=, $0, $1{{$}} @@ -371,18 +359,6 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %c) { ret <8 x i16> %a } -; CHECK-LABEL: signselect_v8i16: -; CHECK-NEXT: .functype signselect_v8i16 (v128, v128, v128) -> (v128){{$}} -; CHECK-NEXT: i16x8.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <8 x i16> @llvm.wasm.signselect.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) -define <8 x i16> @signselect_v8i16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %c) { - %a = call <8 x i16> @llvm.wasm.signselect.v8i16( - <8 x i16> %v1, <8 x i16> %v2, <8 x i16> %c - ) - ret <8 x i16> %a -} - ; CHECK-LABEL: narrow_signed_v8i16: ; CHECK-NEXT: .functype narrow_signed_v8i16 (v128, v128) -> (v128){{$}} ; CHECK-NEXT: i16x8.narrow_i32x4_s $push[[R:[0-9]+]]=, $0, $1{{$}} @@ -532,18 +508,6 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %c) { ret <4 x i32> %a } -; CHECK-LABEL: signselect_v4i32: -; CHECK-NEXT: .functype signselect_v4i32 (v128, v128, v128) -> (v128){{$}} -; CHECK-NEXT: i32x4.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.signselect.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) -define <4 x i32> @signselect_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %c) { - %a = call <4 x i32> @llvm.wasm.signselect.v4i32( - <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %c - ) - ret <4 x i32> %a -} - ; CHECK-LABEL: trunc_sat_s_v4i32: ; NO-CHECK-NOT: f32x4 ; CHECK-NEXT: .functype trunc_sat_s_v4i32 (v128) -> (v128){{$}} @@ -586,27 +550,6 @@ define <4 x i32> @trunc_sat_zero_unsigned_v4i32(<2 x double> %a) { ret <4 x i32> %v } - -; CHECK-LABEL: widen_signed_v4i32: -; CHECK-NEXT: .functype widen_signed_v4i32 (v128) -> (v128){{$}} -; CHECK-NEXT: i32x4.widen_i8x16_s $push[[R:[0-9]+]]=, $0, 1{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.widen.signed(<16 x i8>, i32 immarg) -define <4 x i32> @widen_signed_v4i32(<16 x i8> %x) { - %v = call <4 x i32> @llvm.wasm.widen.signed(<16 x i8> %x, i32 1) - ret <4 x i32> %v -} - -; CHECK-LABEL: widen_unsigned_v4i32: -; CHECK-NEXT: .functype widen_unsigned_v4i32 (v128) -> (v128){{$}} -; CHECK-NEXT: i32x4.widen_i8x16_u $push[[R:[0-9]+]]=, $0, 1{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.widen.unsigned(<16 x i8>, i32 immarg) -define <4 x i32> @widen_unsigned_v4i32(<16 x i8> %x) { - %v = call <4 x i32> @llvm.wasm.widen.unsigned(<16 x i8> %x, i32 1) - ret <4 x i32> %v -} - ; ============================================================================== ; 2 x i64 ; ============================================================================== @@ -750,18 +693,6 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %c) { ret <2 x i64> %a } -; CHECK-LABEL: signselect_v2i64: -; CHECK-NEXT: .functype signselect_v2i64 (v128, v128, v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.signselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.signselect.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) -define <2 x i64> @signselect_v2i64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %c) { - %a = call <2 x i64> @llvm.wasm.signselect.v2i64( - <2 x i64> %v1, <2 x i64> %v2, <2 x i64> %c - ) - ret <2 x i64> %a -} - ; 
============================================================================== ; 4 x f32 ; ============================================================================== @@ -837,30 +768,6 @@ define <4 x float> @nearest_v4f32(<4 x float> %a) { ret <4 x float> %v } -; CHECK-LABEL: qfma_v4f32: -; CHECK-NEXT: .functype qfma_v4f32 (v128, v128, v128) -> (v128){{$}} -; CHECK-NEXT: f32x4.qfma $push[[R:[0-9]+]]=, $0, $1, $2{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x float> @llvm.wasm.qfma.v4f32(<4 x float>, <4 x float>, <4 x float>) -define <4 x float> @qfma_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { - %v = call <4 x float> @llvm.wasm.qfma.v4f32( - <4 x float> %a, <4 x float> %b, <4 x float> %c - ) - ret <4 x float> %v -} - -; CHECK-LABEL: qfms_v4f32: -; CHECK-NEXT: .functype qfms_v4f32 (v128, v128, v128) -> (v128){{$}} -; CHECK-NEXT: f32x4.qfms $push[[R:[0-9]+]]=, $0, $1, $2{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x float> @llvm.wasm.qfms.v4f32(<4 x float>, <4 x float>, <4 x float>) -define <4 x float> @qfms_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) { - %v = call <4 x float> @llvm.wasm.qfms.v4f32( - <4 x float> %a, <4 x float> %b, <4 x float> %c - ) - ret <4 x float> %v -} - ; CHECK-LABEL: demote_zero_v4f32: ; CHECK-NEXT: .functype demote_zero_v4f32 (v128) -> (v128){{$}} ; CHECK-NEXT: f32x4.demote_zero_f64x2 $push[[R:[0-9]+]]=, $0{{$}} @@ -946,30 +853,6 @@ define <2 x double> @nearest_v2f64(<2 x double> %a) { ret <2 x double> %v } -; CHECK-LABEL: qfma_v2f64: -; CHECK-NEXT: .functype qfma_v2f64 (v128, v128, v128) -> (v128){{$}} -; CHECK-NEXT: f64x2.qfma $push[[R:[0-9]+]]=, $0, $1, $2{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x double> @llvm.wasm.qfma.v2f64(<2 x double>, <2 x double>, <2 x double>) -define <2 x double> @qfma_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { - %v = call <2 x double> @llvm.wasm.qfma.v2f64( - <2 x double> %a, <2 x double> %b, <2 x double> %c - ) - ret <2 x double> %v -} - -; CHECK-LABEL: qfms_v2f64: -; CHECK-NEXT: .functype qfms_v2f64 (v128, v128, v128) -> (v128){{$}} -; CHECK-NEXT: f64x2.qfms $push[[R:[0-9]+]]=, $0, $1, $2{{$}} -; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x double> @llvm.wasm.qfms.v2f64(<2 x double>, <2 x double>, <2 x double>) -define <2 x double> @qfms_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) { - %v = call <2 x double> @llvm.wasm.qfms.v2f64( - <2 x double> %a, <2 x double> %b, <2 x double> %c - ) - ret <2 x double> %v -} - ; CHECK-LABEL: convert_low_signed_v2f64: ; CHECK-NEXT: .functype convert_low_signed_v2f64 (v128) -> (v128){{$}} ; CHECK-NEXT: f64x2.convert_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} diff --git a/llvm/test/CodeGen/WebAssembly/simd-prefetch-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-prefetch-offset.ll deleted file mode 100644 index f3b54481c0e4..000000000000 --- a/llvm/test/CodeGen/WebAssembly/simd-prefetch-offset.ll +++ /dev/null @@ -1,235 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s - -; Test experimental prefetch instructions - -target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" -target triple = "wasm32-unknown-unknown" - -declare void @llvm.wasm.prefetch.t(i8*) -declare void @llvm.wasm.prefetch.nt(i8*) -@gv = global i8 0 - -;===---------------------------------------------------------------------------- -; prefetch.t -;===---------------------------------------------------------------------------- - -define void 
@prefetch_t_no_offset(i8* %p) { -; CHECK-LABEL: prefetch_t_no_offset: -; CHECK: .functype prefetch_t_no_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: prefetch.t 0 -; CHECK-NEXT: # fallthrough-return - tail call void @llvm.wasm.prefetch.t(i8* %p) - ret void -} - -define void @prefetch_t_with_folded_offset(i8* %p) { -; CHECK-LABEL: prefetch_t_with_folded_offset: -; CHECK: .functype prefetch_t_with_folded_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.t 0 -; CHECK-NEXT: # fallthrough-return - %q = ptrtoint i8* %p to i32 - %r = add nuw i32 %q, 24 - %s = inttoptr i32 %r to i8* - tail call void @llvm.wasm.prefetch.t(i8* %s) - ret void -} - -define void @prefetch_t_with_folded_gep_offset(i8* %p) { -; CHECK-LABEL: prefetch_t_with_folded_gep_offset: -; CHECK: .functype prefetch_t_with_folded_gep_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const 6 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.t 0 -; CHECK-NEXT: # fallthrough-return - %s = getelementptr inbounds i8, i8* %p, i32 6 - tail call void @llvm.wasm.prefetch.t(i8* %s) - ret void -} - -define void @prefetch_t_with_unfolded_gep_negative_offset(i8* %p) { -; CHECK-LABEL: prefetch_t_with_unfolded_gep_negative_offset: -; CHECK: .functype prefetch_t_with_unfolded_gep_negative_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const -6 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.t 0 -; CHECK-NEXT: # fallthrough-return - %s = getelementptr inbounds i8, i8* %p, i32 -6 - tail call void @llvm.wasm.prefetch.t(i8* %s) - ret void -} - -define void @prefetch_t_with_unfolded_offset(i8* %p) { -; CHECK-LABEL: prefetch_t_with_unfolded_offset: -; CHECK: .functype prefetch_t_with_unfolded_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.t 0 -; CHECK-NEXT: # fallthrough-return - %q = ptrtoint i8* %p to i32 - %r = add nsw i32 %q, 24 - %s = inttoptr i32 %r to i8* - tail call void @llvm.wasm.prefetch.t(i8* %s) - ret void -} - -define void @prefetch_t_with_unfolded_gep_offset(i8* %p) { -; CHECK-LABEL: prefetch_t_with_unfolded_gep_offset: -; CHECK: .functype prefetch_t_with_unfolded_gep_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const 6 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.t 0 -; CHECK-NEXT: # fallthrough-return - %s = getelementptr i8, i8* %p, i32 6 - tail call void @llvm.wasm.prefetch.t(i8* %s) - ret void -} - -define void @prefetch_t_from_numeric_address() { -; CHECK-LABEL: prefetch_t_from_numeric_address: -; CHECK: .functype prefetch_t_from_numeric_address () -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i32.const 42 -; CHECK-NEXT: prefetch.t 0 -; CHECK-NEXT: # fallthrough-return - %s = inttoptr i32 42 to i8* - tail call void @llvm.wasm.prefetch.t(i8* %s) - ret void -} - -define void @prefetch_t_from_global_address() { -; CHECK-LABEL: prefetch_t_from_global_address: -; CHECK: .functype prefetch_t_from_global_address () -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i32.const gv -; CHECK-NEXT: prefetch.t 0 -; CHECK-NEXT: # fallthrough-return - tail call void @llvm.wasm.prefetch.t(i8* @gv) - ret void -} - -;===---------------------------------------------------------------------------- -; prefetch.nt -;===---------------------------------------------------------------------------- - -define void 
@prefetch_nt_no_offset(i8* %p) { -; CHECK-LABEL: prefetch_nt_no_offset: -; CHECK: .functype prefetch_nt_no_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: prefetch.nt 0 -; CHECK-NEXT: # fallthrough-return - tail call void @llvm.wasm.prefetch.nt(i8* %p) - ret void -} - -define void @prefetch_nt_with_folded_offset(i8* %p) { -; CHECK-LABEL: prefetch_nt_with_folded_offset: -; CHECK: .functype prefetch_nt_with_folded_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.nt 0 -; CHECK-NEXT: # fallthrough-return - %q = ptrtoint i8* %p to i32 - %r = add nuw i32 %q, 24 - %s = inttoptr i32 %r to i8* - tail call void @llvm.wasm.prefetch.nt(i8* %s) - ret void -} - -define void @prefetch_nt_with_folded_gep_offset(i8* %p) { -; CHECK-LABEL: prefetch_nt_with_folded_gep_offset: -; CHECK: .functype prefetch_nt_with_folded_gep_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const 6 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.nt 0 -; CHECK-NEXT: # fallthrough-return - %s = getelementptr inbounds i8, i8* %p, i64 6 - tail call void @llvm.wasm.prefetch.nt(i8* %s) - ret void -} - -define void @prefetch_nt_with_unfolded_gep_negative_offset(i8* %p) { -; CHECK-LABEL: prefetch_nt_with_unfolded_gep_negative_offset: -; CHECK: .functype prefetch_nt_with_unfolded_gep_negative_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const -6 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.nt 0 -; CHECK-NEXT: # fallthrough-return - %s = getelementptr inbounds i8, i8* %p, i64 -6 - tail call void @llvm.wasm.prefetch.nt(i8* %s) - ret void -} - -define void @prefetch_nt_with_unfolded_offset(i8* %p) { -; CHECK-LABEL: prefetch_nt_with_unfolded_offset: -; CHECK: .functype prefetch_nt_with_unfolded_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const 24 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.nt 0 -; CHECK-NEXT: # fallthrough-return - %q = ptrtoint i8* %p to i32 - %r = add nsw i32 %q, 24 - %s = inttoptr i32 %r to i8* - tail call void @llvm.wasm.prefetch.nt(i8* %s) - ret void -} - -define void @prefetch_nt_with_unfolded_gep_offset(i8* %p) { -; CHECK-LABEL: prefetch_nt_with_unfolded_gep_offset: -; CHECK: .functype prefetch_nt_with_unfolded_gep_offset (i32) -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32.const 6 -; CHECK-NEXT: i32.add -; CHECK-NEXT: prefetch.nt 0 -; CHECK-NEXT: # fallthrough-return - %s = getelementptr i8, i8* %p, i64 6 - tail call void @llvm.wasm.prefetch.nt(i8* %s) - ret void -} - -define void @prefetch_nt_from_numeric_address() { -; CHECK-LABEL: prefetch_nt_from_numeric_address: -; CHECK: .functype prefetch_nt_from_numeric_address () -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i32.const 42 -; CHECK-NEXT: prefetch.nt 0 -; CHECK-NEXT: # fallthrough-return - %s = inttoptr i32 42 to i8* - tail call void @llvm.wasm.prefetch.nt(i8* %s) - ret void -} - -define void @prefetch_nt_from_global_address() { -; CHECK-LABEL: prefetch_nt_from_global_address: -; CHECK: .functype prefetch_nt_from_global_address () -> () -; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: i32.const gv -; CHECK-NEXT: prefetch.nt 0 -; CHECK-NEXT: # fallthrough-return - tail call void @llvm.wasm.prefetch.nt(i8* @gv) - ret void -} diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index 4ecf5e487665..f9f4a553a63d 100644 --- 
a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -664,18 +664,6 @@ main: # CHECK: v128.load64_zero 32 # encoding: [0xfd,0xfd,0x01,0x03,0x20] v128.load64_zero 32 - # CHECK: f32x4.qfma # encoding: [0xfd,0xb4,0x01] - f32x4.qfma - - # CHECK: f32x4.qfms # encoding: [0xfd,0xd4,0x01] - f32x4.qfms - - # CHECK: f64x2.qfma # encoding: [0xfd,0xfe,0x01] - f64x2.qfma - - # CHECK: f64x2.qfms # encoding: [0xfd,0xff,0x01] - f64x2.qfms - # CHECK: i16x8.extmul_low_i8x16_s # encoding: [0xfd,0x9a,0x01] i16x8.extmul_low_i8x16_s @@ -712,18 +700,6 @@ main: # CHECK: i64x2.extmul_high_i32x4_u # encoding: [0xfd,0xd7,0x01] i64x2.extmul_high_i32x4_u - # CHECK: i8x16.signselect # encoding: [0xfd,0x7d] - i8x16.signselect - - # CHECK: i16x8.signselect # encoding: [0xfd,0x7e] - i16x8.signselect - - # CHECK: i32x4.signselect # encoding: [0xfd,0x7f] - i32x4.signselect - - # CHECK: i64x2.signselect # encoding: [0xfd,0x94,0x01] - i64x2.signselect - # CHECK: i16x8.extadd_pairwise_i8x16_s # encoding: [0xfd,0xc2,0x01] i16x8.extadd_pairwise_i8x16_s @@ -736,12 +712,6 @@ main: # CHECK: i32x4.extadd_pairwise_i16x8_u # encoding: [0xfd,0xa6,0x01] i32x4.extadd_pairwise_i16x8_u - # CHECK: prefetch.t 16 # encoding: [0xfd,0xc5,0x01,0x00,0x10] - prefetch.t 16 - - # CHECK: prefetch.nt 16 # encoding: [0xfd,0xc6,0x01,0x00,0x10] - prefetch.nt 16 - # CHECK: f64x2.convert_low_i32x4_s # encoding: [0xfd,0x53] f64x2.convert_low_i32x4_s @@ -760,10 +730,4 @@ main: # CHECK: f64x2.promote_low_f32x4 # encoding: [0xfd,0x69] f64x2.promote_low_f32x4 - # CHECK: i32x4.widen_i8x16_s 3 # encoding: [0xfd,0x67,0x03] - i32x4.widen_i8x16_s 3 - - # CHECK: i32x4.widen_i8x16_u 3 # encoding: [0xfd,0x68,0x03] - i32x4.widen_i8x16_u 3 - end_function -- GitLab From f5764a8654e3caa6ca5dab3a89238c165062228f Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Thu, 18 Mar 2021 11:21:24 -0700 Subject: [PATCH 0071/1000] [WebAssembly] Finalize SIMD names and opcodes Updates the names (e.g. widen => extend, saturate => sat) and opcodes of all SIMD instructions to match the finalized SIMD spec. Deliberately does not change the public interface in wasm_simd128.h yet; that will require more care. Depends on D98466. 
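Only direct callers of the builtins need to track these renames, since
wasm_simd128.h keeps its existing function names on top of the renamed
builtins for now. An illustrative call-site update (hypothetical user code,
compiled with -msimd128):

    typedef short i16x8 __attribute__((__vector_size__(16)));

    static i16x8 saturating_add(i16x8 a, i16x8 b) {
      /* was: return __builtin_wasm_add_saturate_s_i16x8(a, b); */
      return __builtin_wasm_add_sat_s_i16x8(a, b);
    }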
Differential Revision: https://reviews.llvm.org/D98676 --- .../clang/Basic/BuiltinsWebAssembly.def | 30 +- clang/lib/CodeGen/CGBuiltin.cpp | 77 ++-- clang/lib/Headers/wasm_simd128.h | 24 +- clang/test/CodeGen/builtins-wasm.c | 82 ++--- llvm/include/llvm/IR/IntrinsicsWebAssembly.td | 20 +- .../lib/Target/WebAssembly/WebAssemblyISD.def | 8 +- .../WebAssembly/WebAssemblyISelLowering.cpp | 14 +- .../WebAssembly/WebAssemblyInstrSIMD.td | 154 ++++---- .../{simd-widening.ll => simd-extending.ll} | 118 +++---- .../CodeGen/WebAssembly/simd-intrinsics.ll | 92 ++--- llvm/test/MC/WebAssembly/simd-encodings.s | 328 ++++++++++-------- 11 files changed, 489 insertions(+), 458 deletions(-) rename llvm/test/CodeGen/WebAssembly/{simd-widening.ll => simd-extending.ll} (59%) diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index 2f51376ba15a..6ea59026cd02 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -84,15 +84,15 @@ TARGET_BUILTIN(__builtin_wasm_replace_lane_i64x2, "V2LLiV2LLiIiLLi", "nc", "simd TARGET_BUILTIN(__builtin_wasm_replace_lane_f32x4, "V4fV4fIif", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_replace_lane_f64x2, "V2dV2dIid", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_add_saturate_s_i8x16, "V16ScV16ScV16Sc", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_add_saturate_u_i8x16, "V16UcV16UcV16Uc", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_add_saturate_s_i16x8, "V8sV8sV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_add_saturate_u_i16x8, "V8UsV8UsV8Us", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_add_sat_s_i8x16, "V16ScV16ScV16Sc", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_add_sat_u_i8x16, "V16UcV16UcV16Uc", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_add_sat_s_i16x8, "V8sV8sV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_add_sat_u_i16x8, "V8UsV8UsV8Us", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_sub_saturate_s_i8x16, "V16ScV16ScV16Sc", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_sub_saturate_u_i8x16, "V16UcV16UcV16Uc", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_sub_saturate_s_i16x8, "V8sV8sV8s", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_sub_saturate_u_i16x8, "V8UsV8UsV8Us", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_sub_sat_s_i8x16, "V16ScV16ScV16Sc", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_sub_sat_u_i8x16, "V16UcV16UcV16Uc", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_sub_sat_s_i16x8, "V8sV8sV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_sub_sat_u_i16x8, "V8UsV8UsV8Us", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_abs_i8x16, "V16ScV16Sc", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_abs_i16x8, "V8sV8s", "nc", "simd128") @@ -116,7 +116,7 @@ TARGET_BUILTIN(__builtin_wasm_avgr_u_i16x8, "V8UsV8UsV8Us", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_popcnt_i8x16, "V16ScV16Sc", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_q15mulr_saturate_s_i16x8, "V8sV8sV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_q15mulr_sat_s_i16x8, "V8sV8sV8s", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_extmul_low_i8x16_s_i16x8, "V8sV16ScV16Sc", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_extmul_high_i8x16_s_i16x8, "V8sV16ScV16Sc", "nc", "simd128") @@ -191,15 +191,15 @@ TARGET_BUILTIN(__builtin_wasm_narrow_u_i8x16_i16x8, "V16UcV8UsV8Us", "nc", "simd TARGET_BUILTIN(__builtin_wasm_narrow_s_i16x8_i32x4, "V8sV4iV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_narrow_u_i16x8_i32x4, 
"V8UsV4UiV4Ui", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_s_i32x4_i64x2, "V2LLiV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_s_i32x4_i64x2, "V2LLiV4i", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_low_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_widen_high_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_extend_low_s_i32x4_i64x2, "V2LLiV4i", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_extend_high_s_i32x4_i64x2, "V2LLiV4i", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_extend_low_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_extend_high_u_i32x4_i64x2, "V2LLUiV4Ui", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_convert_low_s_i32x4_f64x2, "V2dV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_convert_low_u_i32x4_f64x2, "V2dV4Ui", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_trunc_saturate_zero_s_f64x2_i32x4, "V4iV2d", "nc", "simd128") -TARGET_BUILTIN(__builtin_wasm_trunc_saturate_zero_u_f64x2_i32x4, "V4UiV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4, "V4iV2d", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4, "V4UiV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_demote_zero_f64x2_f32x4, "V4fV2d", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_promote_low_f32x4_f64x2, "V2dV4f", "nc", "simd128") diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 96df7b0d6222..33a444e471f5 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17194,31 +17194,31 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, llvm_unreachable("unexpected builtin ID"); } } - case WebAssembly::BI__builtin_wasm_add_saturate_s_i8x16: - case WebAssembly::BI__builtin_wasm_add_saturate_u_i8x16: - case WebAssembly::BI__builtin_wasm_add_saturate_s_i16x8: - case WebAssembly::BI__builtin_wasm_add_saturate_u_i16x8: - case WebAssembly::BI__builtin_wasm_sub_saturate_s_i8x16: - case WebAssembly::BI__builtin_wasm_sub_saturate_u_i8x16: - case WebAssembly::BI__builtin_wasm_sub_saturate_s_i16x8: - case WebAssembly::BI__builtin_wasm_sub_saturate_u_i16x8: { + case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16: + case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16: + case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8: + case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8: + case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16: + case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16: + case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8: + case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8: { unsigned IntNo; switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_add_saturate_s_i8x16: - case WebAssembly::BI__builtin_wasm_add_saturate_s_i16x8: + case WebAssembly::BI__builtin_wasm_add_sat_s_i8x16: + case WebAssembly::BI__builtin_wasm_add_sat_s_i16x8: IntNo = Intrinsic::sadd_sat; break; - case WebAssembly::BI__builtin_wasm_add_saturate_u_i8x16: - case WebAssembly::BI__builtin_wasm_add_saturate_u_i16x8: + case WebAssembly::BI__builtin_wasm_add_sat_u_i8x16: + case WebAssembly::BI__builtin_wasm_add_sat_u_i16x8: IntNo = Intrinsic::uadd_sat; break; - case WebAssembly::BI__builtin_wasm_sub_saturate_s_i8x16: - case WebAssembly::BI__builtin_wasm_sub_saturate_s_i16x8: - IntNo = Intrinsic::wasm_sub_saturate_signed; + case WebAssembly::BI__builtin_wasm_sub_sat_s_i8x16: + case WebAssembly::BI__builtin_wasm_sub_sat_s_i16x8: + IntNo = Intrinsic::wasm_sub_sat_signed; 
break; - case WebAssembly::BI__builtin_wasm_sub_saturate_u_i8x16: - case WebAssembly::BI__builtin_wasm_sub_saturate_u_i16x8: - IntNo = Intrinsic::wasm_sub_saturate_unsigned; + case WebAssembly::BI__builtin_wasm_sub_sat_u_i8x16: + case WebAssembly::BI__builtin_wasm_sub_sat_u_i16x8: + IntNo = Intrinsic::wasm_sub_sat_unsigned; break; default: llvm_unreachable("unexpected builtin ID"); @@ -17286,11 +17286,10 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } - case WebAssembly::BI__builtin_wasm_q15mulr_saturate_s_i16x8: { + case WebAssembly::BI__builtin_wasm_q15mulr_sat_s_i16x8: { Value *LHS = EmitScalarExpr(E->getArg(0)); Value *RHS = EmitScalarExpr(E->getArg(1)); - Function *Callee = - CGM.getIntrinsic(Intrinsic::wasm_q15mulr_saturate_signed); + Function *Callee = CGM.getIntrinsic(Intrinsic::wasm_q15mulr_sat_signed); return Builder.CreateCall(Callee, {LHS, RHS}); } case WebAssembly::BI__builtin_wasm_extmul_low_i8x16_s_i16x8: @@ -17456,24 +17455,24 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, CGM.getIntrinsic(IntNo, {ConvertType(E->getType()), Low->getType()}); return Builder.CreateCall(Callee, {Low, High}); } - case WebAssembly::BI__builtin_wasm_widen_low_s_i32x4_i64x2: - case WebAssembly::BI__builtin_wasm_widen_high_s_i32x4_i64x2: - case WebAssembly::BI__builtin_wasm_widen_low_u_i32x4_i64x2: - case WebAssembly::BI__builtin_wasm_widen_high_u_i32x4_i64x2: { + case WebAssembly::BI__builtin_wasm_extend_low_s_i32x4_i64x2: + case WebAssembly::BI__builtin_wasm_extend_high_s_i32x4_i64x2: + case WebAssembly::BI__builtin_wasm_extend_low_u_i32x4_i64x2: + case WebAssembly::BI__builtin_wasm_extend_high_u_i32x4_i64x2: { Value *Vec = EmitScalarExpr(E->getArg(0)); unsigned IntNo; switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_widen_low_s_i32x4_i64x2: - IntNo = Intrinsic::wasm_widen_low_signed; + case WebAssembly::BI__builtin_wasm_extend_low_s_i32x4_i64x2: + IntNo = Intrinsic::wasm_extend_low_signed; break; - case WebAssembly::BI__builtin_wasm_widen_high_s_i32x4_i64x2: - IntNo = Intrinsic::wasm_widen_high_signed; + case WebAssembly::BI__builtin_wasm_extend_high_s_i32x4_i64x2: + IntNo = Intrinsic::wasm_extend_high_signed; break; - case WebAssembly::BI__builtin_wasm_widen_low_u_i32x4_i64x2: - IntNo = Intrinsic::wasm_widen_low_unsigned; + case WebAssembly::BI__builtin_wasm_extend_low_u_i32x4_i64x2: + IntNo = Intrinsic::wasm_extend_low_unsigned; break; - case WebAssembly::BI__builtin_wasm_widen_high_u_i32x4_i64x2: - IntNo = Intrinsic::wasm_widen_high_unsigned; + case WebAssembly::BI__builtin_wasm_extend_high_u_i32x4_i64x2: + IntNo = Intrinsic::wasm_extend_high_unsigned; break; default: llvm_unreachable("unexpected builtin ID"); @@ -17498,16 +17497,16 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(IntNo); return Builder.CreateCall(Callee, Vec); } - case WebAssembly::BI__builtin_wasm_trunc_saturate_zero_s_f64x2_i32x4: - case WebAssembly::BI__builtin_wasm_trunc_saturate_zero_u_f64x2_i32x4: { + case WebAssembly::BI__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4: + case WebAssembly::BI__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4: { Value *Vec = EmitScalarExpr(E->getArg(0)); unsigned IntNo; switch (BuiltinID) { - case WebAssembly::BI__builtin_wasm_trunc_saturate_zero_s_f64x2_i32x4: - IntNo = Intrinsic::wasm_trunc_saturate_zero_signed; + case WebAssembly::BI__builtin_wasm_trunc_sat_zero_s_f64x2_i32x4: + IntNo = 
Intrinsic::wasm_trunc_sat_zero_signed; break; - case WebAssembly::BI__builtin_wasm_trunc_saturate_zero_u_f64x2_i32x4: - IntNo = Intrinsic::wasm_trunc_saturate_zero_unsigned; + case WebAssembly::BI__builtin_wasm_trunc_sat_zero_u_f64x2_i32x4: + IntNo = Intrinsic::wasm_trunc_sat_zero_unsigned; break; default: llvm_unreachable("unexpected builtin ID"); diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index ac88516ac924..20f5a85b3224 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -616,14 +616,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_add_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_saturate_s_i8x16((__i8x16)__a, - (__i8x16)__b); + return (v128_t)__builtin_wasm_add_sat_s_i8x16((__i8x16)__a, (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_add_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_saturate_u_i8x16((__u8x16)__a, - (__u8x16)__b); + return (v128_t)__builtin_wasm_add_sat_u_i8x16((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a, @@ -633,14 +631,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_sub_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_saturate_s_i8x16((__i8x16)__a, - (__i8x16)__b); + return (v128_t)__builtin_wasm_sub_sat_s_i8x16((__i8x16)__a, (__i8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u8x16_sub_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_saturate_u_i8x16((__u8x16)__a, - (__u8x16)__b); + return (v128_t)__builtin_wasm_sub_sat_u_i8x16((__u8x16)__a, (__u8x16)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i8x16_min(v128_t __a, @@ -706,14 +702,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_add_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_saturate_s_i16x8((__i16x8)__a, - (__i16x8)__b); + return (v128_t)__builtin_wasm_add_sat_s_i16x8((__i16x8)__a, (__i16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_add_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_add_saturate_u_i16x8((__u16x8)__a, - (__u16x8)__b); + return (v128_t)__builtin_wasm_add_sat_u_i16x8((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a, @@ -723,14 +717,12 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub(v128_t __a, static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_sub_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_saturate_s_i16x8((__i16x8)__a, - (__i16x8)__b); + return (v128_t)__builtin_wasm_sub_sat_s_i16x8((__i16x8)__a, (__i16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_u16x8_sub_saturate(v128_t __a, v128_t __b) { - return (v128_t)__builtin_wasm_sub_saturate_u_i16x8((__u16x8)__a, - (__u16x8)__b); + return (v128_t)__builtin_wasm_sub_sat_u_i16x8((__u16x8)__a, (__u16x8)__b); } static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i16x8_mul(v128_t __a, diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index 71816ceda469..f635e6825896 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -340,44 +340,44 @@ void store64_lane(long long *p, i64x2 v) { // WEBASSEMBLY-NEXT: 
ret } -i8x16 add_saturate_s_i8x16(i8x16 x, i8x16 y) { - return __builtin_wasm_add_saturate_s_i8x16(x, y); +i8x16 add_sat_s_i8x16(i8x16 x, i8x16 y) { + return __builtin_wasm_add_sat_s_i8x16(x, y); // WEBASSEMBLY: call <16 x i8> @llvm.sadd.sat.v16i8( // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y) // WEBASSEMBLY-NEXT: ret } -u8x16 add_saturate_u_i8x16(u8x16 x, u8x16 y) { - return __builtin_wasm_add_saturate_u_i8x16(x, y); +u8x16 add_sat_u_i8x16(u8x16 x, u8x16 y) { + return __builtin_wasm_add_sat_u_i8x16(x, y); // WEBASSEMBLY: call <16 x i8> @llvm.uadd.sat.v16i8( // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y) // WEBASSEMBLY-NEXT: ret } -i16x8 add_saturate_s_i16x8(i16x8 x, i16x8 y) { - return __builtin_wasm_add_saturate_s_i16x8(x, y); +i16x8 add_sat_s_i16x8(i16x8 x, i16x8 y) { + return __builtin_wasm_add_sat_s_i16x8(x, y); // WEBASSEMBLY: call <8 x i16> @llvm.sadd.sat.v8i16( // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y) // WEBASSEMBLY-NEXT: ret } -u16x8 add_saturate_u_i16x8(u16x8 x, u16x8 y) { - return __builtin_wasm_add_saturate_u_i16x8(x, y); +u16x8 add_sat_u_i16x8(u16x8 x, u16x8 y) { + return __builtin_wasm_add_sat_u_i16x8(x, y); // WEBASSEMBLY: call <8 x i16> @llvm.uadd.sat.v8i16( // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y) // WEBASSEMBLY-NEXT: ret } -i8x16 sub_saturate_s_i8x16(i8x16 x, i8x16 y) { - return __builtin_wasm_sub_saturate_s_i8x16(x, y); - // WEBASSEMBLY: call <16 x i8> @llvm.wasm.sub.saturate.signed.v16i8( +i8x16 sub_sat_s_i8x16(i8x16 x, i8x16 y) { + return __builtin_wasm_sub_sat_s_i8x16(x, y); + // WEBASSEMBLY: call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8( // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y) // WEBASSEMBLY-NEXT: ret } -u8x16 sub_saturate_u_i8x16(u8x16 x, u8x16 y) { - return __builtin_wasm_sub_saturate_u_i8x16(x, y); - // WEBASSEMBLY: call <16 x i8> @llvm.wasm.sub.saturate.unsigned.v16i8( +u8x16 sub_sat_u_i8x16(u8x16 x, u8x16 y) { + return __builtin_wasm_sub_sat_u_i8x16(x, y); + // WEBASSEMBLY: call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8( // WEBASSEMBLY-SAME: <16 x i8> %x, <16 x i8> %y) // WEBASSEMBLY-NEXT: ret } @@ -484,16 +484,16 @@ u32x4 max_u_i32x4(u32x4 x, u32x4 y) { // WEBASSEMBLY-NEXT: ret <4 x i32> %1 } -i16x8 sub_saturate_s_i16x8(i16x8 x, i16x8 y) { - return __builtin_wasm_sub_saturate_s_i16x8(x, y); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.sub.saturate.signed.v8i16( +i16x8 sub_sat_s_i16x8(i16x8 x, i16x8 y) { + return __builtin_wasm_sub_sat_s_i16x8(x, y); + // WEBASSEMBLY: call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16( // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y) // WEBASSEMBLY-NEXT: ret } -u16x8 sub_saturate_u_i16x8(u16x8 x, u16x8 y) { - return __builtin_wasm_sub_saturate_u_i16x8(x, y); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.sub.saturate.unsigned.v8i16( +u16x8 sub_sat_u_i16x8(u16x8 x, u16x8 y) { + return __builtin_wasm_sub_sat_u_i16x8(x, y); + // WEBASSEMBLY: call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16( // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y) // WEBASSEMBLY-NEXT: ret } @@ -512,9 +512,9 @@ u16x8 avgr_u_i16x8(u16x8 x, u16x8 y) { // WEBASSEMBLY-NEXT: ret } -i16x8 q15mulr_saturate_s_i16x8(i16x8 x, i16x8 y) { - return __builtin_wasm_q15mulr_saturate_s_i16x8(x, y); - // WEBASSEMBLY: call <8 x i16> @llvm.wasm.q15mulr.saturate.signed( +i16x8 q15mulr_sat_s_i16x8(i16x8 x, i16x8 y) { + return __builtin_wasm_q15mulr_sat_s_i16x8(x, y); + // WEBASSEMBLY: call <8 x i16> @llvm.wasm.q15mulr.sat.signed( // WEBASSEMBLY-SAME: <8 x i16> %x, <8 x i16> %y) // WEBASSEMBLY-NEXT: ret } @@ -896,27 +896,27 @@ u16x8 
narrow_u_i16x8_i32x4(u32x4 low, u32x4 high) { // WEBASSEMBLY: ret } -i64x2 widen_low_s_i32x4_i64x2(i32x4 x) { - return __builtin_wasm_widen_low_s_i32x4_i64x2(x); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.widen.low.signed(<4 x i32> %x) +i64x2 extend_low_s_i32x4_i64x2(i32x4 x) { + return __builtin_wasm_extend_low_s_i32x4_i64x2(x); + // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extend.low.signed(<4 x i32> %x) // WEBASSEMBLY: ret } -i64x2 widen_high_s_i32x4_i64x2(i32x4 x) { - return __builtin_wasm_widen_high_s_i32x4_i64x2(x); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.widen.high.signed(<4 x i32> %x) +i64x2 extend_high_s_i32x4_i64x2(i32x4 x) { + return __builtin_wasm_extend_high_s_i32x4_i64x2(x); + // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extend.high.signed(<4 x i32> %x) // WEBASSEMBLY: ret } -u64x2 widen_low_u_i32x4_i64x2(u32x4 x) { - return __builtin_wasm_widen_low_u_i32x4_i64x2(x); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.widen.low.unsigned(<4 x i32> %x) +u64x2 extend_low_u_i32x4_i64x2(u32x4 x) { + return __builtin_wasm_extend_low_u_i32x4_i64x2(x); + // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extend.low.unsigned(<4 x i32> %x) // WEBASSEMBLY: ret } -u64x2 widen_high_u_i32x4_i64x2(u32x4 x) { - return __builtin_wasm_widen_high_u_i32x4_i64x2(x); - // WEBASSEMBLY: call <2 x i64> @llvm.wasm.widen.high.unsigned(<4 x i32> %x) +u64x2 extend_high_u_i32x4_i64x2(u32x4 x) { + return __builtin_wasm_extend_high_u_i32x4_i64x2(x); + // WEBASSEMBLY: call <2 x i64> @llvm.wasm.extend.high.unsigned(<4 x i32> %x) // WEBASSEMBLY: ret } @@ -932,15 +932,15 @@ f64x2 convert_low_u_i32x4_f64x2(u32x4 x) { // WEBASSEMBLY: ret } -i32x4 trunc_saturate_zero_s_f64x2_i32x4(f64x2 x) { - return __builtin_wasm_trunc_saturate_zero_s_f64x2_i32x4(x); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.saturate.zero.signed(<2 x double> %x) +i32x4 trunc_sat_zero_s_f64x2_i32x4(f64x2 x) { + return __builtin_wasm_trunc_sat_zero_s_f64x2_i32x4(x); + // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.sat.zero.signed(<2 x double> %x) // WEBASSEMBLY: ret } -u32x4 trunc_saturate_zero_u_f64x2_i32x4(f64x2 x) { - return __builtin_wasm_trunc_saturate_zero_u_f64x2_i32x4(x); - // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.saturate.zero.unsigned(<2 x double> %x) +u32x4 trunc_sat_zero_u_f64x2_i32x4(f64x2 x) { + return __builtin_wasm_trunc_sat_zero_u_f64x2_i32x4(x); + // WEBASSEMBLY: call <4 x i32> @llvm.wasm.trunc.sat.zero.unsigned(<2 x double> %x) // WEBASSEMBLY: ret } diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index cd916e78f9f4..7e7d151d22fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -115,11 +115,11 @@ def int_wasm_shuffle : llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_sub_saturate_signed : +def int_wasm_sub_sat_signed : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_sub_saturate_unsigned : +def int_wasm_sub_sat_unsigned : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; @@ -158,17 +158,17 @@ def int_wasm_narrow_unsigned : [IntrNoMem, IntrSpeculatable]>; // TODO: Replace these intrinsics with normal ISel patterns once i32x4 to i64x2 -// widening is merged to the proposal. -def int_wasm_widen_low_signed : +// extending is merged to the proposal. 
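// Annotation: these select to the i64x2.extend_{low,high}_i32x4_{s,u}
// instructions and surface in Clang as the
// __builtin_wasm_extend_{low,high}_{s,u}_i32x4_i64x2 builtins renamed above.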
+def int_wasm_extend_low_signed : Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_high_signed : +def int_wasm_extend_high_signed : Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_low_unsigned : +def int_wasm_extend_low_unsigned : Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_widen_high_unsigned : +def int_wasm_extend_high_unsigned : Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_q15mulr_saturate_signed : +def int_wasm_q15mulr_sat_signed : Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, IntrSpeculatable]>; @@ -308,10 +308,10 @@ def int_wasm_convert_low_signed : def int_wasm_convert_low_unsigned : Intrinsic<[llvm_v2f64_ty], [llvm_v4i32_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_trunc_saturate_zero_signed : +def int_wasm_trunc_sat_zero_signed : Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem, IntrSpeculatable]>; -def int_wasm_trunc_saturate_zero_unsigned : +def int_wasm_trunc_sat_zero_unsigned : Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem, IntrSpeculatable]>; def int_wasm_demote_zero : diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def index d75afdcefb7d..3a82dd45a5f6 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def @@ -29,10 +29,10 @@ HANDLE_NODETYPE(SWIZZLE) HANDLE_NODETYPE(VEC_SHL) HANDLE_NODETYPE(VEC_SHR_S) HANDLE_NODETYPE(VEC_SHR_U) -HANDLE_NODETYPE(WIDEN_LOW_S) -HANDLE_NODETYPE(WIDEN_LOW_U) -HANDLE_NODETYPE(WIDEN_HIGH_S) -HANDLE_NODETYPE(WIDEN_HIGH_U) +HANDLE_NODETYPE(EXTEND_LOW_S) +HANDLE_NODETYPE(EXTEND_LOW_U) +HANDLE_NODETYPE(EXTEND_HIGH_S) +HANDLE_NODETYPE(EXTEND_HIGH_U) HANDLE_NODETYPE(THROW) HANDLE_NODETYPE(CATCH) HANDLE_NODETYPE(MEMORY_COPY) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index f28fe67b0b46..85d2d2f60a53 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -1898,8 +1898,8 @@ performVECTOR_SHUFFLECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { return DAG.getBitcast(DstType, NewShuffle); } -static SDValue performVectorWidenCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI) { +static SDValue +performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { auto &DAG = DCI.DAG; assert(N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND); @@ -1933,10 +1933,10 @@ static SDValue performVectorWidenCombine(SDNode *N, bool IsSext = N->getOpcode() == ISD::SIGN_EXTEND; bool IsLow = Index == 0; - unsigned Op = IsSext ? (IsLow ? WebAssemblyISD::WIDEN_LOW_S - : WebAssemblyISD::WIDEN_HIGH_S) - : (IsLow ? WebAssemblyISD::WIDEN_LOW_U - : WebAssemblyISD::WIDEN_HIGH_U); + unsigned Op = IsSext ? (IsLow ? WebAssemblyISD::EXTEND_LOW_S + : WebAssemblyISD::EXTEND_HIGH_S) + : (IsLow ? 
WebAssemblyISD::EXTEND_LOW_U + : WebAssemblyISD::EXTEND_HIGH_U); return DAG.getNode(Op, SDLoc(N), ResVT, Source); } @@ -1951,6 +1951,6 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N, return performVECTOR_SHUFFLECombine(N, DCI); case ISD::SIGN_EXTEND: case ISD::ZERO_EXTEND: - return performVectorWidenCombine(N, DCI); + return performVectorExtendCombine(N, DCI); } } diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 83f29acf6348..9afb7a077796 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -266,8 +266,8 @@ multiclass SIMDLoadZero simdop> { // TODO: Also support v4f32 and v2f64 once the instructions are merged // to the proposal -defm "" : SIMDLoadZero; -defm "" : SIMDLoadZero; +defm "" : SIMDLoadZero; +defm "" : SIMDLoadZero; foreach vec = [I32x4, I64x2] in { defvar loadpat = !cast("int_wasm_load"#vec.lane_bits#"_zero"); @@ -302,10 +302,10 @@ multiclass SIMDLoadLane simdop> { // TODO: Also support v4f32 and v2f64 once the instructions are merged // to the proposal -defm "" : SIMDLoadLane; -defm "" : SIMDLoadLane; -defm "" : SIMDLoadLane; -defm "" : SIMDLoadLane; +defm "" : SIMDLoadLane; +defm "" : SIMDLoadLane; +defm "" : SIMDLoadLane; +defm "" : SIMDLoadLane; // Select loads with no constant offset. multiclass LoadLanePatNoOffset { @@ -375,10 +375,10 @@ multiclass SIMDStoreLane simdop> { // TODO: Also support v4f32 and v2f64 once the instructions are merged // to the proposal -defm "" : SIMDStoreLane; -defm "" : SIMDStoreLane; -defm "" : SIMDStoreLane; -defm "" : SIMDStoreLane; +defm "" : SIMDStoreLane; +defm "" : SIMDStoreLane; +defm "" : SIMDStoreLane; +defm "" : SIMDStoreLane; // Select stores with no constant offset. 
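// Annotation: mirrors LoadLanePatNoOffset above — match a lane store whose
// address is a plain register (no constant offset) and select the
// corresponding store-lane instruction.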
multiclass StoreLanePatNoOffset { @@ -917,19 +917,19 @@ multiclass SIMDBinaryInt baseInst> defm "" : SIMDBinary; } -// Integer addition: add / add_saturate_s / add_saturate_u +// Integer addition: add / add_sat_s / add_sat_u let isCommutable = 1 in { defm ADD : SIMDBinaryInt; -defm ADD_SAT_S : SIMDBinaryIntSmall; -defm ADD_SAT_U : SIMDBinaryIntSmall; +defm ADD_SAT_S : SIMDBinaryIntSmall; +defm ADD_SAT_U : SIMDBinaryIntSmall; } // isCommutable = 1 -// Integer subtraction: sub / sub_saturate_s / sub_saturate_u +// Integer subtraction: sub / sub_sat_s / sub_sat_u defm SUB : SIMDBinaryInt; defm SUB_SAT_S : - SIMDBinaryIntSmall; + SIMDBinaryIntSmall; defm SUB_SAT_U : - SIMDBinaryIntSmall; + SIMDBinaryIntSmall; // Integer multiplication: mul let isCommutable = 1 in @@ -980,31 +980,31 @@ multiclass SIMDExtBinary simdop> } defm EXTMUL_LOW_S : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_HIGH_S : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_LOW_U : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_HIGH_U : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_LOW_S : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_HIGH_S : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_LOW_U : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_HIGH_U : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_LOW_S : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_HIGH_S : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_LOW_U : - SIMDExtBinary; + SIMDExtBinary; defm EXTMUL_HIGH_U : - SIMDExtBinary; + SIMDExtBinary; //===----------------------------------------------------------------------===// // Floating-point unary arithmetic @@ -1025,14 +1025,14 @@ defm NEG : SIMDUnaryFP; defm SQRT : SIMDUnaryFP; // Rounding: ceil, floor, trunc, nearest -defm CEIL : SIMDUnary; -defm FLOOR : SIMDUnary; -defm TRUNC: SIMDUnary; -defm NEAREST: SIMDUnary; -defm CEIL : SIMDUnary; -defm FLOOR : SIMDUnary; -defm TRUNC: SIMDUnary; -defm NEAREST: SIMDUnary; +defm CEIL : SIMDUnary; +defm FLOOR : SIMDUnary; +defm TRUNC: SIMDUnary; +defm NEAREST: SIMDUnary; +defm CEIL : SIMDUnary; +defm FLOOR : SIMDUnary; +defm TRUNC: SIMDUnary; +defm NEAREST: SIMDUnary; //===----------------------------------------------------------------------===// // Floating-point binary arithmetic @@ -1089,42 +1089,42 @@ defm "" : SIMDConvert; defm "" : SIMDConvert; defm "" : SIMDConvert; -// Lower llvm.wasm.trunc.saturate.* to saturating instructions +// Lower llvm.wasm.trunc.sat.* to saturating instructions def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))), (fp_to_sint_I32x4 $src)>; def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))), (fp_to_uint_I32x4 $src)>; -// Widening operations -def widen_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; -def widen_low_s : SDNode<"WebAssemblyISD::WIDEN_LOW_S", widen_t>; -def widen_high_s : SDNode<"WebAssemblyISD::WIDEN_HIGH_S", widen_t>; -def widen_low_u : SDNode<"WebAssemblyISD::WIDEN_LOW_U", widen_t>; -def widen_high_u : SDNode<"WebAssemblyISD::WIDEN_HIGH_U", widen_t>; +// Extending operations +def extend_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>; +def extend_low_s : SDNode<"WebAssemblyISD::EXTEND_LOW_S", extend_t>; +def extend_high_s : SDNode<"WebAssemblyISD::EXTEND_HIGH_S", extend_t>; +def extend_low_u : SDNode<"WebAssemblyISD::EXTEND_LOW_U", extend_t>; +def extend_high_u : SDNode<"WebAssemblyISD::EXTEND_HIGH_U", extend_t>; // TODO: refactor this to be uniform for i64x2 if the numbering is not changed. 
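// Annotation: SIMDExtend instantiates all four extend_{low,high}_..._{s,u}
// conversions for one input/output lane-size pair; per the TODO above, the
// i64x2 forms are still defined individually below.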
-multiclass SIMDWiden baseInst> { - defm "" : SIMDConvert; - defm "" : SIMDConvert; - defm "" : SIMDConvert; - defm "" : SIMDConvert; +multiclass SIMDExtend baseInst> { + defm "" : SIMDConvert; + defm "" : SIMDConvert; + defm "" : SIMDConvert; + defm "" : SIMDConvert; } -defm "" : SIMDWiden; -defm "" : SIMDWiden; +defm "" : SIMDExtend; +defm "" : SIMDExtend; -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; // Narrowing operations multiclass SIMDNarrow baseInst> { @@ -1232,31 +1232,31 @@ def : Pat<(t1 (bitconvert (t2 V128:$v))), (t1 V128:$v)>; // Extended pairwise addition defm "" : SIMDConvert; + "extadd_pairwise_i8x16_s", 0x7c>; defm "" : SIMDConvert; + "extadd_pairwise_i8x16_u", 0x7d>; defm "" : SIMDConvert; + "extadd_pairwise_i16x8_s", 0x7e>; defm "" : SIMDConvert; + "extadd_pairwise_i16x8_u", 0x7f>; // Prototype f64x2 conversions -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; -defm "" : SIMDConvert; defm "" : SIMDConvert; + "demote_zero_f64x2", 0x5e>; defm "" : SIMDConvert; + "promote_low_f32x4", 0x5f>; +defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; +defm "" : SIMDConvert; //===----------------------------------------------------------------------===// // Saturating Rounding Q-Format Multiplication //===----------------------------------------------------------------------===// defm Q15MULR_SAT_S : - SIMDBinary; + SIMDBinary; diff --git a/llvm/test/CodeGen/WebAssembly/simd-widening.ll b/llvm/test/CodeGen/WebAssembly/simd-extending.ll similarity index 59% rename from llvm/test/CodeGen/WebAssembly/simd-widening.ll rename to llvm/test/CodeGen/WebAssembly/simd-extending.ll index c9a7ffbbfcaf..3f512cd2678e 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-widening.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-extending.ll @@ -1,121 +1,121 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mattr=+simd128 | FileCheck %s -;; Test that SIMD widening operations can be successfully selected +;; Test that SIMD extending operations can be successfully selected target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" target triple = "wasm32-unknown-unknown" -define <8 x i16> @widen_low_i8x16_s(<16 x i8> %v) { -; CHECK-LABEL: widen_low_i8x16_s: -; CHECK: .functype widen_low_i8x16_s (v128) -> (v128) +define <8 x i16> @extend_low_i8x16_s(<16 x i8> %v) { +; CHECK-LABEL: extend_low_i8x16_s: +; CHECK: .functype extend_low_i8x16_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.widen_low_i8x16_s +; CHECK-NEXT: i16x8.extend_low_i8x16_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> - %widened = sext <8 x i8> %low to <8 x i16> - ret <8 x i16> %widened + %extended = sext <8 x i8> %low to <8 x i16> + ret <8 x i16> %extended } -define <8 x i16> @widen_low_i8x16_u(<16 x i8> %v) { -; CHECK-LABEL: widen_low_i8x16_u: -; CHECK: .functype widen_low_i8x16_u (v128) -> (v128) +define <8 x i16> @extend_low_i8x16_u(<16 x i8> %v) { +; CHECK-LABEL: extend_low_i8x16_u: +; CHECK: .functype extend_low_i8x16_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.widen_low_i8x16_u +; CHECK-NEXT: i16x8.extend_low_i8x16_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> - %widened = zext <8 x i8> %low to <8 x i16> - ret <8 x i16> 
%widened + %extended = zext <8 x i8> %low to <8 x i16> + ret <8 x i16> %extended } -define <8 x i16> @widen_high_i8x16_s(<16 x i8> %v) { -; CHECK-LABEL: widen_high_i8x16_s: -; CHECK: .functype widen_high_i8x16_s (v128) -> (v128) +define <8 x i16> @extend_high_i8x16_s(<16 x i8> %v) { +; CHECK-LABEL: extend_high_i8x16_s: +; CHECK: .functype extend_high_i8x16_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.widen_high_i8x16_s +; CHECK-NEXT: i16x8.extend_high_i8x16_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> - %widened = sext <8 x i8> %low to <8 x i16> - ret <8 x i16> %widened + %extended = sext <8 x i8> %low to <8 x i16> + ret <8 x i16> %extended } -define <8 x i16> @widen_high_i8x16_u(<16 x i8> %v) { -; CHECK-LABEL: widen_high_i8x16_u: -; CHECK: .functype widen_high_i8x16_u (v128) -> (v128) +define <8 x i16> @extend_high_i8x16_u(<16 x i8> %v) { +; CHECK-LABEL: extend_high_i8x16_u: +; CHECK: .functype extend_high_i8x16_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i16x8.widen_high_i8x16_u +; CHECK-NEXT: i16x8.extend_high_i8x16_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> - %widened = zext <8 x i8> %low to <8 x i16> - ret <8 x i16> %widened + %extended = zext <8 x i8> %low to <8 x i16> + ret <8 x i16> %extended } -define <4 x i32> @widen_low_i16x8_s(<8 x i16> %v) { -; CHECK-LABEL: widen_low_i16x8_s: -; CHECK: .functype widen_low_i16x8_s (v128) -> (v128) +define <4 x i32> @extend_low_i16x8_s(<8 x i16> %v) { +; CHECK-LABEL: extend_low_i16x8_s: +; CHECK: .functype extend_low_i16x8_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.widen_low_i16x8_s +; CHECK-NEXT: i32x4.extend_low_i16x8_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %widened = sext <4 x i16> %low to <4 x i32> - ret <4 x i32> %widened + %extended = sext <4 x i16> %low to <4 x i32> + ret <4 x i32> %extended } -define <4 x i32> @widen_low_i16x8_u(<8 x i16> %v) { -; CHECK-LABEL: widen_low_i16x8_u: -; CHECK: .functype widen_low_i16x8_u (v128) -> (v128) +define <4 x i32> @extend_low_i16x8_u(<8 x i16> %v) { +; CHECK-LABEL: extend_low_i16x8_u: +; CHECK: .functype extend_low_i16x8_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.widen_low_i16x8_u +; CHECK-NEXT: i32x4.extend_low_i16x8_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %widened = zext <4 x i16> %low to <4 x i32> - ret <4 x i32> %widened + %extended = zext <4 x i16> %low to <4 x i32> + ret <4 x i32> %extended } -define <4 x i32> @widen_high_i16x8_s(<8 x i16> %v) { -; CHECK-LABEL: widen_high_i16x8_s: -; CHECK: .functype widen_high_i16x8_s (v128) -> (v128) +define <4 x i32> @extend_high_i16x8_s(<8 x i16> %v) { +; CHECK-LABEL: extend_high_i16x8_s: +; CHECK: .functype extend_high_i16x8_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.widen_high_i16x8_s +; CHECK-NEXT: i32x4.extend_high_i16x8_s ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %widened = sext <4 x i16> %low to <4 x i32> - ret <4 x i32> %widened + %extended = sext <4 x i16> %low to <4 x i32> + ret <4 x i32> %extended } -define <4 x i32> @widen_high_i16x8_u(<8 x i16> %v) { -; CHECK-LABEL: widen_high_i16x8_u: -; CHECK: .functype widen_high_i16x8_u (v128) -> (v128) 
+define <4 x i32> @extend_high_i16x8_u(<8 x i16> %v) { +; CHECK-LABEL: extend_high_i16x8_u: +; CHECK: .functype extend_high_i16x8_u (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 -; CHECK-NEXT: i32x4.widen_high_i16x8_u +; CHECK-NEXT: i32x4.extend_high_i16x8_u ; CHECK-NEXT: # fallthrough-return %low = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %widened = zext <4 x i16> %low to <4 x i32> - ret <4 x i32> %widened + %extended = zext <4 x i16> %low to <4 x i32> + ret <4 x i32> %extended } ;; Also test that similar patterns with offsets not corresponding to ;; the low or high half are correctly expanded. -define <8 x i16> @widen_lowish_i8x16_s(<16 x i8> %v) { -; CHECK-LABEL: widen_lowish_i8x16_s: -; CHECK: .functype widen_lowish_i8x16_s (v128) -> (v128) +define <8 x i16> @extend_lowish_i8x16_s(<16 x i8> %v) { +; CHECK-LABEL: extend_lowish_i8x16_s: +; CHECK: .functype extend_lowish_i8x16_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i8x16.extract_lane_u 1 @@ -148,13 +148,13 @@ define <8 x i16> @widen_lowish_i8x16_s(<16 x i8> %v) { ; CHECK-NEXT: # fallthrough-return %lowish = shufflevector <16 x i8> %v, <16 x i8> undef, <8 x i32> - %widened = sext <8 x i8> %lowish to <8 x i16> - ret <8 x i16> %widened + %extended = sext <8 x i8> %lowish to <8 x i16> + ret <8 x i16> %extended } -define <4 x i32> @widen_lowish_i16x8_s(<8 x i16> %v) { -; CHECK-LABEL: widen_lowish_i16x8_s: -; CHECK: .functype widen_lowish_i16x8_s (v128) -> (v128) +define <4 x i32> @extend_lowish_i16x8_s(<8 x i16> %v) { +; CHECK-LABEL: extend_lowish_i16x8_s: +; CHECK: .functype extend_lowish_i16x8_s (v128) -> (v128) ; CHECK-NEXT: # %bb.0: ; CHECK-NEXT: local.get 0 ; CHECK-NEXT: i16x8.extract_lane_u 1 @@ -175,6 +175,6 @@ define <4 x i32> @widen_lowish_i16x8_s(<8 x i16> %v) { ; CHECK-NEXT: # fallthrough-return %lowish = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> - %widened = sext <4 x i16> %lowish to <4 x i32> - ret <4 x i32> %widened + %extended = sext <4 x i16> %lowish to <4 x i32> + ret <4 x i32> %extended } diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll index 606b8b6753d1..d2fdde3fcb3c 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -23,7 +23,7 @@ define <16 x i8> @swizzle_v16i8(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_sat_s_v16i8: ; CHECK-NEXT: .functype add_sat_s_v16i8 (v128, v128) -> (v128){{$}} -; CHECK-NEXT: i8x16.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: i8x16.add_sat_s $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) { @@ -33,7 +33,7 @@ define <16 x i8> @add_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: add_sat_u_v16i8: ; CHECK-NEXT: .functype add_sat_u_v16i8 (v128, v128) -> (v128){{$}} -; CHECK-NEXT: i8x16.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: i8x16.add_sat_u $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} declare <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8>, <16 x i8>) define <16 x i8> @add_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) { @@ -43,11 +43,11 @@ define <16 x i8> @add_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: sub_sat_s_v16i8: ; CHECK-NEXT: .functype sub_sat_s_v16i8 (v128, v128) -> (v128){{$}} -; CHECK-NEXT: i8x16.sub_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: 
i8x16.sub_sat_s $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <16 x i8> @llvm.wasm.sub.saturate.signed.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.wasm.sub.sat.signed.v16i8(<16 x i8>, <16 x i8>) define <16 x i8> @sub_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) { - %a = call <16 x i8> @llvm.wasm.sub.saturate.signed.v16i8( + %a = call <16 x i8> @llvm.wasm.sub.sat.signed.v16i8( <16 x i8> %x, <16 x i8> %y ) ret <16 x i8> %a @@ -55,11 +55,11 @@ define <16 x i8> @sub_sat_s_v16i8(<16 x i8> %x, <16 x i8> %y) { ; CHECK-LABEL: sub_sat_u_v16i8: ; CHECK-NEXT: .functype sub_sat_u_v16i8 (v128, v128) -> (v128){{$}} -; CHECK-NEXT: i8x16.sub_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: i8x16.sub_sat_u $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <16 x i8> @llvm.wasm.sub.saturate.unsigned.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8(<16 x i8>, <16 x i8>) define <16 x i8> @sub_sat_u_v16i8(<16 x i8> %x, <16 x i8> %y) { - %a = call <16 x i8> @llvm.wasm.sub.saturate.unsigned.v16i8( + %a = call <16 x i8> @llvm.wasm.sub.sat.unsigned.v16i8( <16 x i8> %x, <16 x i8> %y ) ret <16 x i8> %a @@ -186,7 +186,7 @@ define <16 x i8> @shuffle_undef_v16i8(<16 x i8> %x, <16 x i8> %y) { ; ============================================================================== ; CHECK-LABEL: add_sat_s_v8i16: ; CHECK-NEXT: .functype add_sat_s_v8i16 (v128, v128) -> (v128){{$}} -; CHECK-NEXT: i16x8.add_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: i16x8.add_sat_s $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) { @@ -196,7 +196,7 @@ define <8 x i16> @add_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: add_sat_u_v8i16: ; CHECK-NEXT: .functype add_sat_u_v8i16 (v128, v128) -> (v128){{$}} -; CHECK-NEXT: i16x8.add_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: i16x8.add_sat_u $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} declare <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16>, <8 x i16>) define <8 x i16> @add_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) { @@ -206,11 +206,11 @@ define <8 x i16> @add_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: sub_sat_s_v8i16: ; CHECK-NEXT: .functype sub_sat_s_v8i16 (v128, v128) -> (v128){{$}} -; CHECK-NEXT: i16x8.sub_saturate_s $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: i16x8.sub_sat_s $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <8 x i16> @llvm.wasm.sub.saturate.signed.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.wasm.sub.sat.signed.v8i16(<8 x i16>, <8 x i16>) define <8 x i16> @sub_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) { - %a = call <8 x i16> @llvm.wasm.sub.saturate.signed.v8i16( + %a = call <8 x i16> @llvm.wasm.sub.sat.signed.v8i16( <8 x i16> %x, <8 x i16> %y ) ret <8 x i16> %a @@ -218,11 +218,11 @@ define <8 x i16> @sub_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) { ; CHECK-LABEL: sub_sat_u_v8i16: ; CHECK-NEXT: .functype sub_sat_u_v8i16 (v128, v128) -> (v128){{$}} -; CHECK-NEXT: i16x8.sub_saturate_u $push[[R:[0-9]+]]=, $0, $1{{$}} +; CHECK-NEXT: i16x8.sub_sat_u $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <8 x i16> @llvm.wasm.sub.saturate.unsigned.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16(<8 x i16>, <8 x i16>) define <8 x i16> @sub_sat_u_v8i16(<8 x i16> %x, <8 x i16> %y) { - %a = call <8 x i16> 
@llvm.wasm.sub.saturate.unsigned.v8i16( + %a = call <8 x i16> @llvm.wasm.sub.sat.unsigned.v8i16( <8 x i16> %x, <8 x i16> %y ) ret <8 x i16> %a @@ -242,9 +242,9 @@ define <8 x i16> @avgr_u_v8i16(<8 x i16> %x, <8 x i16> %y) { ; CHECK-NEXT: .functype q15mulr_sat_s_v8i16 (v128, v128) -> (v128){{$}} ; CHECK-NEXT: i16x8.q15mulr_sat_s $push[[R:[0-9]+]]=, $0, $1{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <8 x i16> @llvm.wasm.q15mulr.saturate.signed(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16>, <8 x i16>) define <8 x i16> @q15mulr_sat_s_v8i16(<8 x i16> %x, <8 x i16> %y) { - %a = call <8 x i16> @llvm.wasm.q15mulr.saturate.signed(<8 x i16> %x, + %a = call <8 x i16> @llvm.wasm.q15mulr.sat.signed(<8 x i16> %x, <8 x i16> %y) ret <8 x i16> %a } @@ -534,9 +534,9 @@ define <4 x i32> @trunc_sat_u_v4i32(<4 x float> %x) { ; CHECK-NEXT: .functype trunc_sat_zero_signed_v4i32 (v128) -> (v128){{$}} ; CHECK-NEXT: i32x4.trunc_sat_zero_f64x2_s $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.trunc.saturate.zero.signed(<2 x double>) +declare <4 x i32> @llvm.wasm.trunc.sat.zero.signed(<2 x double>) define <4 x i32> @trunc_sat_zero_signed_v4i32(<2 x double> %a) { - %v = call <4 x i32> @llvm.wasm.trunc.saturate.zero.signed(<2 x double> %a) + %v = call <4 x i32> @llvm.wasm.trunc.sat.zero.signed(<2 x double> %a) ret <4 x i32> %v } @@ -544,9 +544,9 @@ define <4 x i32> @trunc_sat_zero_signed_v4i32(<2 x double> %a) { ; CHECK-NEXT: .functype trunc_sat_zero_unsigned_v4i32 (v128) -> (v128){{$}} ; CHECK-NEXT: i32x4.trunc_sat_zero_f64x2_u $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <4 x i32> @llvm.wasm.trunc.saturate.zero.unsigned(<2 x double>) +declare <4 x i32> @llvm.wasm.trunc.sat.zero.unsigned(<2 x double>) define <4 x i32> @trunc_sat_zero_unsigned_v4i32(<2 x double> %a) { - %v = call <4 x i32> @llvm.wasm.trunc.saturate.zero.unsigned(<2 x double> %a) + %v = call <4 x i32> @llvm.wasm.trunc.sat.zero.unsigned(<2 x double> %a) ret <4 x i32> %v } @@ -563,43 +563,43 @@ define <2 x i64> @eq_v2i64(<2 x i64> %x, <2 x i64> %y) { ret <2 x i64> %a } -; CHECK-LABEL: widen_low_s_v2i64: -; CHECK-NEXT: .functype widen_low_s_v2i64 (v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.widen_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-LABEL: extend_low_s_v2i64: +; CHECK-NEXT: .functype extend_low_s_v2i64 (v128) -> (v128){{$}} +; CHECK-NEXT: i64x2.extend_low_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.widen.low.signed(<4 x i32>) -define <2 x i64> @widen_low_s_v2i64(<4 x i32> %x) { - %a = call <2 x i64> @llvm.wasm.widen.low.signed(<4 x i32> %x) +declare <2 x i64> @llvm.wasm.extend.low.signed(<4 x i32>) +define <2 x i64> @extend_low_s_v2i64(<4 x i32> %x) { + %a = call <2 x i64> @llvm.wasm.extend.low.signed(<4 x i32> %x) ret <2 x i64> %a } -; CHECK-LABEL: widen_high_s_v2i64: -; CHECK-NEXT: .functype widen_high_s_v2i64 (v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.widen_high_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-LABEL: extend_high_s_v2i64: +; CHECK-NEXT: .functype extend_high_s_v2i64 (v128) -> (v128){{$}} +; CHECK-NEXT: i64x2.extend_high_i32x4_s $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.widen.high.signed(<4 x i32>) -define <2 x i64> @widen_high_s_v2i64(<4 x i32> %x) { - %a = call <2 x i64> @llvm.wasm.widen.high.signed(<4 x i32> %x) +declare <2 x i64> @llvm.wasm.extend.high.signed(<4 x i32>) +define <2 x i64> 
@extend_high_s_v2i64(<4 x i32> %x) { + %a = call <2 x i64> @llvm.wasm.extend.high.signed(<4 x i32> %x) ret <2 x i64> %a } -; CHECK-LABEL: widen_low_u_v2i64: -; CHECK-NEXT: .functype widen_low_u_v2i64 (v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.widen_low_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-LABEL: extend_low_u_v2i64: +; CHECK-NEXT: .functype extend_low_u_v2i64 (v128) -> (v128){{$}} +; CHECK-NEXT: i64x2.extend_low_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.widen.low.unsigned(<4 x i32>) -define <2 x i64> @widen_low_u_v2i64(<4 x i32> %x) { - %a = call <2 x i64> @llvm.wasm.widen.low.unsigned(<4 x i32> %x) +declare <2 x i64> @llvm.wasm.extend.low.unsigned(<4 x i32>) +define <2 x i64> @extend_low_u_v2i64(<4 x i32> %x) { + %a = call <2 x i64> @llvm.wasm.extend.low.unsigned(<4 x i32> %x) ret <2 x i64> %a } -; CHECK-LABEL: widen_high_u_v2i64: -; CHECK-NEXT: .functype widen_high_u_v2i64 (v128) -> (v128){{$}} -; CHECK-NEXT: i64x2.widen_high_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} +; CHECK-LABEL: extend_high_u_v2i64: +; CHECK-NEXT: .functype extend_high_u_v2i64 (v128) -> (v128){{$}} +; CHECK-NEXT: i64x2.extend_high_i32x4_u $push[[R:[0-9]+]]=, $0{{$}} ; CHECK-NEXT: return $pop[[R]]{{$}} -declare <2 x i64> @llvm.wasm.widen.high.unsigned(<4 x i32>) -define <2 x i64> @widen_high_u_v2i64(<4 x i32> %x) { - %a = call <2 x i64> @llvm.wasm.widen.high.unsigned(<4 x i32> %x) +declare <2 x i64> @llvm.wasm.extend.high.unsigned(<4 x i32>) +define <2 x i64> @extend_high_u_v2i64(<4 x i32> %x) { + %a = call <2 x i64> @llvm.wasm.extend.high.unsigned(<4 x i32> %x) ret <2 x i64> %a } diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index f9f4a553a63d..1a687468487a 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -280,38 +280,51 @@ main: # CHECK: v128.bitselect # encoding: [0xfd,0x52] v128.bitselect - # CHECK: v128.load8_lane 32, 1 # encoding: [0xfd,0x58,0x00,0x20,0x01] + # TODO: v128.any_true # encoding: [0xfd,0x53] + + # CHECK: v128.load8_lane 32, 1 # encoding: [0xfd,0x54,0x00,0x20,0x01] v128.load8_lane 32, 1 - # CHECK: v128.load16_lane 32, 1 # encoding: [0xfd,0x59,0x01,0x20,0x01] + # CHECK: v128.load16_lane 32, 1 # encoding: [0xfd,0x55,0x01,0x20,0x01] v128.load16_lane 32, 1 - # CHECK: v128.load32_lane 32, 1 # encoding: [0xfd,0x5a,0x02,0x20,0x01] + # CHECK: v128.load32_lane 32, 1 # encoding: [0xfd,0x56,0x02,0x20,0x01] v128.load32_lane 32, 1 - # CHECK: v128.load64_lane 32, 1 # encoding: [0xfd,0x5b,0x03,0x20,0x01] + # CHECK: v128.load64_lane 32, 1 # encoding: [0xfd,0x57,0x03,0x20,0x01] v128.load64_lane 32, 1 - # CHECK: v128.store8_lane 32, 1 # encoding: [0xfd,0x5c,0x00,0x20,0x01] + # CHECK: v128.store8_lane 32, 1 # encoding: [0xfd,0x58,0x00,0x20,0x01] v128.store8_lane 32, 1 - # CHECK: v128.store16_lane 32, 1 # encoding: [0xfd,0x5d,0x01,0x20,0x01] + # CHECK: v128.store16_lane 32, 1 # encoding: [0xfd,0x59,0x01,0x20,0x01] v128.store16_lane 32, 1 - # CHECK: v128.store32_lane 32, 1 # encoding: [0xfd,0x5e,0x02,0x20,0x01] + # CHECK: v128.store32_lane 32, 1 # encoding: [0xfd,0x5a,0x02,0x20,0x01] v128.store32_lane 32, 1 - # CHECK: v128.store64_lane 32, 1 # encoding: [0xfd,0x5f,0x03,0x20,0x01] + # CHECK: v128.store64_lane 32, 1 # encoding: [0xfd,0x5b,0x03,0x20,0x01] v128.store64_lane 32, 1 + # CHECK: v128.load32_zero 32 # encoding: [0xfd,0x5c,0x02,0x20] + v128.load32_zero 32 + + # CHECK: v128.load64_zero 32 # encoding: [0xfd,0x5d,0x03,0x20] + v128.load64_zero 32 + + 
# CHECK: f32x4.demote_zero_f64x2 # encoding: [0xfd,0x5e] + f32x4.demote_zero_f64x2 + + # CHECK: f64x2.promote_low_f32x4 # encoding: [0xfd,0x5f] + f64x2.promote_low_f32x4 + # CHECK: i8x16.abs # encoding: [0xfd,0x60] i8x16.abs # CHECK: i8x16.neg # encoding: [0xfd,0x61] i8x16.neg - # CHECK: i8x16.any_true # encoding: [0xfd,0x62] - i8x16.any_true + # TODO: i8x16.popcnt # encoding: [0xfd,0x62] # CHECK: i8x16.all_true # encoding: [0xfd,0x63] i8x16.all_true @@ -325,6 +338,18 @@ main: # CHECK: i8x16.narrow_i16x8_u # encoding: [0xfd,0x66] i8x16.narrow_i16x8_u + # CHECK: f32x4.ceil # encoding: [0xfd,0x67] + f32x4.ceil + + # CHECK: f32x4.floor # encoding: [0xfd,0x68] + f32x4.floor + + # CHECK: f32x4.trunc # encoding: [0xfd,0x69] + f32x4.trunc + + # CHECK: f32x4.nearest # encoding: [0xfd,0x6a] + f32x4.nearest + # CHECK: i8x16.shl # encoding: [0xfd,0x6b] i8x16.shl @@ -337,20 +362,26 @@ main: # CHECK: i8x16.add # encoding: [0xfd,0x6e] i8x16.add - # CHECK: i8x16.add_saturate_s # encoding: [0xfd,0x6f] - i8x16.add_saturate_s + # CHECK: i8x16.add_sat_s # encoding: [0xfd,0x6f] + i8x16.add_sat_s - # CHECK: i8x16.add_saturate_u # encoding: [0xfd,0x70] - i8x16.add_saturate_u + # CHECK: i8x16.add_sat_u # encoding: [0xfd,0x70] + i8x16.add_sat_u # CHECK: i8x16.sub # encoding: [0xfd,0x71] i8x16.sub - # CHECK: i8x16.sub_saturate_s # encoding: [0xfd,0x72] - i8x16.sub_saturate_s + # CHECK: i8x16.sub_sat_s # encoding: [0xfd,0x72] + i8x16.sub_sat_s - # CHECK: i8x16.sub_saturate_u # encoding: [0xfd,0x73] - i8x16.sub_saturate_u + # CHECK: i8x16.sub_sat_u # encoding: [0xfd,0x73] + i8x16.sub_sat_u + + # CHECK: f64x2.ceil # encoding: [0xfd,0x74] + f64x2.ceil + + # CHECK: f64x2.floor # encoding: [0xfd,0x75] + f64x2.floor # CHECK: i8x16.min_s # encoding: [0xfd,0x76] i8x16.min_s @@ -364,11 +395,23 @@ main: # CHECK: i8x16.max_u # encoding: [0xfd,0x79] i8x16.max_u + # CHECK: f64x2.trunc # encoding: [0xfd,0x7a] + f64x2.trunc + # CHECK: i8x16.avgr_u # encoding: [0xfd,0x7b] i8x16.avgr_u - # CHECK: i8x16.popcnt # encoding: [0xfd,0x7c] - i8x16.popcnt + # CHECK: i16x8.extadd_pairwise_i8x16_s # encoding: [0xfd,0x7c] + i16x8.extadd_pairwise_i8x16_s + + # CHECK: i16x8.extadd_pairwise_i8x16_u # encoding: [0xfd,0x7d] + i16x8.extadd_pairwise_i8x16_u + + # CHECK: i32x4.extadd_pairwise_i16x8_s # encoding: [0xfd,0x7e] + i32x4.extadd_pairwise_i16x8_s + + # CHECK: i32x4.extadd_pairwise_i16x8_u # encoding: [0xfd,0x7f] + i32x4.extadd_pairwise_i16x8_u # CHECK: i16x8.abs # encoding: [0xfd,0x80,0x01] i16x8.abs @@ -376,8 +419,8 @@ main: # CHECK: i16x8.neg # encoding: [0xfd,0x81,0x01] i16x8.neg - # CHECK: i16x8.any_true # encoding: [0xfd,0x82,0x01] - i16x8.any_true + # CHECK: i16x8.q15mulr_sat_s # encoding: [0xfd,0x82,0x01] + i16x8.q15mulr_sat_s # CHECK: i16x8.all_true # encoding: [0xfd,0x83,0x01] i16x8.all_true @@ -391,17 +434,17 @@ main: # CHECK: i16x8.narrow_i32x4_u # encoding: [0xfd,0x86,0x01] i16x8.narrow_i32x4_u - # CHECK: i16x8.widen_low_i8x16_s # encoding: [0xfd,0x87,0x01] - i16x8.widen_low_i8x16_s + # CHECK: i16x8.extend_low_i8x16_s # encoding: [0xfd,0x87,0x01] + i16x8.extend_low_i8x16_s - # CHECK: i16x8.widen_high_i8x16_s # encoding: [0xfd,0x88,0x01] - i16x8.widen_high_i8x16_s + # CHECK: i16x8.extend_high_i8x16_s # encoding: [0xfd,0x88,0x01] + i16x8.extend_high_i8x16_s - # CHECK: i16x8.widen_low_i8x16_u # encoding: [0xfd,0x89,0x01] - i16x8.widen_low_i8x16_u + # CHECK: i16x8.extend_low_i8x16_u # encoding: [0xfd,0x89,0x01] + i16x8.extend_low_i8x16_u - # CHECK: i16x8.widen_high_i8x16_u # encoding: [0xfd,0x8a,0x01] - i16x8.widen_high_i8x16_u + # 
CHECK: i16x8.extend_high_i8x16_u # encoding: [0xfd,0x8a,0x01] + i16x8.extend_high_i8x16_u # CHECK: i16x8.shl # encoding: [0xfd,0x8b,0x01] i16x8.shl @@ -415,20 +458,23 @@ main: # CHECK: i16x8.add # encoding: [0xfd,0x8e,0x01] i16x8.add - # CHECK: i16x8.add_saturate_s # encoding: [0xfd,0x8f,0x01] - i16x8.add_saturate_s + # CHECK: i16x8.add_sat_s # encoding: [0xfd,0x8f,0x01] + i16x8.add_sat_s - # CHECK: i16x8.add_saturate_u # encoding: [0xfd,0x90,0x01] - i16x8.add_saturate_u + # CHECK: i16x8.add_sat_u # encoding: [0xfd,0x90,0x01] + i16x8.add_sat_u # CHECK: i16x8.sub # encoding: [0xfd,0x91,0x01] i16x8.sub - # CHECK: i16x8.sub_saturate_s # encoding: [0xfd,0x92,0x01] - i16x8.sub_saturate_s + # CHECK: i16x8.sub_sat_s # encoding: [0xfd,0x92,0x01] + i16x8.sub_sat_s - # CHECK: i16x8.sub_saturate_u # encoding: [0xfd,0x93,0x01] - i16x8.sub_saturate_u + # CHECK: i16x8.sub_sat_u # encoding: [0xfd,0x93,0x01] + i16x8.sub_sat_u + + # CHECK: f64x2.nearest # encoding: [0xfd,0x94,0x01] + f64x2.nearest # CHECK: i16x8.mul # encoding: [0xfd,0x95,0x01] i16x8.mul @@ -445,11 +491,22 @@ main: # CHECK: i16x8.max_u # encoding: [0xfd,0x99,0x01] i16x8.max_u + # 0x0a unused + # CHECK: i16x8.avgr_u # encoding: [0xfd,0x9b,0x01] i16x8.avgr_u - # CHECK: i16x8.q15mulr_sat_s # encoding: [0xfd,0x9c,0x01] - i16x8.q15mulr_sat_s + # CHECK: i16x8.extmul_low_i8x16_s # encoding: [0xfd,0x9c,0x01] + i16x8.extmul_low_i8x16_s + + # CHECK: i16x8.extmul_high_i8x16_s # encoding: [0xfd,0x9d,0x01] + i16x8.extmul_high_i8x16_s + + # CHECK: i16x8.extmul_low_i8x16_u # encoding: [0xfd,0x9e,0x01] + i16x8.extmul_low_i8x16_u + + # CHECK: i16x8.extmul_high_i8x16_u # encoding: [0xfd,0x9f,0x01] + i16x8.extmul_high_i8x16_u # CHECK: i32x4.abs # encoding: [0xfd,0xa0,0x01] i32x4.abs @@ -457,8 +514,7 @@ main: # CHECK: i32x4.neg # encoding: [0xfd,0xa1,0x01] i32x4.neg - # CHECK: i32x4.any_true # encoding: [0xfd,0xa2,0x01] - i32x4.any_true + # 0xa2 unused # CHECK: i32x4.all_true # encoding: [0xfd,0xa3,0x01] i32x4.all_true @@ -466,17 +522,21 @@ main: # CHECK: i32x4.bitmask # encoding: [0xfd,0xa4,0x01] i32x4.bitmask - # CHECK: i32x4.widen_low_i16x8_s # encoding: [0xfd,0xa7,0x01] - i32x4.widen_low_i16x8_s + # 0xa5 unused + + # 0xa6 unused - # CHECK: i32x4.widen_high_i16x8_s # encoding: [0xfd,0xa8,0x01] - i32x4.widen_high_i16x8_s + # CHECK: i32x4.extend_low_i16x8_s # encoding: [0xfd,0xa7,0x01] + i32x4.extend_low_i16x8_s - # CHECK: i32x4.widen_low_i16x8_u # encoding: [0xfd,0xa9,0x01] - i32x4.widen_low_i16x8_u + # CHECK: i32x4.extend_high_i16x8_s # encoding: [0xfd,0xa8,0x01] + i32x4.extend_high_i16x8_s - # CHECK: i32x4.widen_high_i16x8_u # encoding: [0xfd,0xaa,0x01] - i32x4.widen_high_i16x8_u + # CHECK: i32x4.extend_low_i16x8_u # encoding: [0xfd,0xa9,0x01] + i32x4.extend_low_i16x8_u + + # CHECK: i32x4.extend_high_i16x8_u # encoding: [0xfd,0xaa,0x01] + i32x4.extend_high_i16x8_u # CHECK: i32x4.shl # encoding: [0xfd,0xab,0x01] i32x4.shl @@ -490,9 +550,19 @@ main: # CHECK: i32x4.add # encoding: [0xfd,0xae,0x01] i32x4.add + # 0xaf unused + + # 0xb0 unused + # CHECK: i32x4.sub # encoding: [0xfd,0xb1,0x01] i32x4.sub + # 0xb2 unused + + # 0xb3 unused + + # 0xb4 unused + # CHECK: i32x4.mul # encoding: [0xfd,0xb5,0x01] i32x4.mul @@ -511,14 +581,26 @@ main: # CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xba,0x01] i32x4.dot_i16x8_s - # CHECK: i64x2.eq # encoding: [0xfd,0xc0,0x01] - i64x2.eq + # 0xbb unused + + # CHECK: i32x4.extmul_low_i16x8_s # encoding: [0xfd,0xbc,0x01] + i32x4.extmul_low_i16x8_s + + # CHECK: i32x4.extmul_high_i16x8_s # encoding: [0xfd,0xbd,0x01] + 
i32x4.extmul_high_i16x8_s + + # CHECK: i32x4.extmul_low_i16x8_u # encoding: [0xfd,0xbe,0x01] + i32x4.extmul_low_i16x8_u + + # CHECK: i32x4.extmul_high_i16x8_u # encoding: [0xfd,0xbf,0x01] + i32x4.extmul_high_i16x8_u + + # TODO: i64x2.abs # encoding: [0xfd,0xc0,0x01] # CHECK: i64x2.neg # encoding: [0xfd,0xc1,0x01] i64x2.neg - # CHECK: i64x2.any_true # encoding: [0xfd,0xc2,0x01] - i64x2.any_true + # 0xc2 unused # CHECK: i64x2.all_true # encoding: [0xfd,0xc3,0x01] i64x2.all_true @@ -526,17 +608,21 @@ main: # CHECK: i64x2.bitmask # encoding: [0xfd,0xc4,0x01] i64x2.bitmask - # CHECK: i64x2.widen_low_i32x4_s # encoding: [0xfd,0xc7,0x01] - i64x2.widen_low_i32x4_s + # 0xc5 unused + + # 0xc6 unused - # CHECK: i64x2.widen_high_i32x4_s # encoding: [0xfd,0xc8,0x01] - i64x2.widen_high_i32x4_s + # CHECK: i64x2.extend_low_i32x4_s # encoding: [0xfd,0xc7,0x01] + i64x2.extend_low_i32x4_s - # CHECK: i64x2.widen_low_i32x4_u # encoding: [0xfd,0xc9,0x01] - i64x2.widen_low_i32x4_u + # CHECK: i64x2.extend_high_i32x4_s # encoding: [0xfd,0xc8,0x01] + i64x2.extend_high_i32x4_s - # CHECK: i64x2.widen_high_i32x4_u # encoding: [0xfd,0xca,0x01] - i64x2.widen_high_i32x4_u + # CHECK: i64x2.extend_low_i32x4_u # encoding: [0xfd,0xc9,0x01] + i64x2.extend_low_i32x4_u + + # CHECK: i64x2.extend_high_i32x4_u # encoding: [0xfd,0xca,0x01] + i64x2.extend_high_i32x4_u # CHECK: i64x2.shl # encoding: [0xfd,0xcb,0x01] i64x2.shl @@ -550,35 +636,45 @@ main: # CHECK: i64x2.add # encoding: [0xfd,0xce,0x01] i64x2.add + # 0xcf unused + + # 0xd0 unused + # CHECK: i64x2.sub # encoding: [0xfd,0xd1,0x01] i64x2.sub + # 0xd2 unused + + # 0xd3 unused + + # 0xd4 unused + # CHECK: i64x2.mul # encoding: [0xfd,0xd5,0x01] i64x2.mul - # CHECK: f32x4.ceil # encoding: [0xfd,0xd8,0x01] - f32x4.ceil + # TODO: i64x2.eq # encoding: [0xfd,0xd6,0x01] - # CHECK: f32x4.floor # encoding: [0xfd,0xd9,0x01] - f32x4.floor + # TODO: i64x2.ne # encoding: [0xfd,0xd7,0x01] - # CHECK: f32x4.trunc # encoding: [0xfd,0xda,0x01] - f32x4.trunc + # TODO: i64x2.lt_s # encoding: [0xfd,0xd8,0x01] - # CHECK: f32x4.nearest # encoding: [0xfd,0xdb,0x01] - f32x4.nearest + # TODO: i64x2.gt_s # encoding: [0xfd,0xd9,0x01] - # CHECK: f64x2.ceil # encoding: [0xfd,0xdc,0x01] - f64x2.ceil + # TODO: i64x2.le_s # encoding: [0xfd,0xda,0x01] - # CHECK: f64x2.floor # encoding: [0xfd,0xdd,0x01] - f64x2.floor + # TODO: i64x2.ge_s # encoding: [0xfd,0xdb,0x01] - # CHECK: f64x2.trunc # encoding: [0xfd,0xde,0x01] - f64x2.trunc + # CHECK: i64x2.extmul_low_i32x4_s # encoding: [0xfd,0xdc,0x01] + i64x2.extmul_low_i32x4_s - # CHECK: f64x2.nearest # encoding: [0xfd,0xdf,0x01] - f64x2.nearest + # CHECK: i64x2.extmul_high_i32x4_s # encoding: [0xfd,0xdd,0x01] + i64x2.extmul_high_i32x4_s + + # CHECK: i64x2.extmul_low_i32x4_u # encoding: [0xfd,0xde,0x01] + i64x2.extmul_low_i32x4_u + + # CHECK: i64x2.extmul_high_i32x4_u # encoding: [0xfd,0xdf,0x01] + i64x2.extmul_high_i32x4_u # CHECK: f32x4.abs # encoding: [0xfd,0xe0,0x01] f32x4.abs @@ -586,6 +682,8 @@ main: # CHECK: f32x4.neg # encoding: [0xfd,0xe1,0x01] f32x4.neg + # 0xe2 unused + # CHECK: f32x4.sqrt # encoding: [0xfd,0xe3,0x01] f32x4.sqrt @@ -619,6 +717,8 @@ main: # CHECK: f64x2.neg # encoding: [0xfd,0xed,0x01] f64x2.neg + # 0xee unused + # CHECK: f64x2.sqrt # encoding: [0xfd,0xef,0x01] f64x2.sqrt @@ -658,76 +758,16 @@ main: # CHECK: f32x4.convert_i32x4_u # encoding: [0xfd,0xfb,0x01] f32x4.convert_i32x4_u - # CHECK: v128.load32_zero 32 # encoding: [0xfd,0xfc,0x01,0x02,0x20] - v128.load32_zero 32 - - # CHECK: v128.load64_zero 32 # encoding: 
[0xfd,0xfd,0x01,0x03,0x20] - v128.load64_zero 32 - - # CHECK: i16x8.extmul_low_i8x16_s # encoding: [0xfd,0x9a,0x01] - i16x8.extmul_low_i8x16_s - - # CHECK: i16x8.extmul_high_i8x16_s # encoding: [0xfd,0x9d,0x01] - i16x8.extmul_high_i8x16_s - - # CHECK: i16x8.extmul_low_i8x16_u # encoding: [0xfd,0x9e,0x01] - i16x8.extmul_low_i8x16_u - - # CHECK: i16x8.extmul_high_i8x16_u # encoding: [0xfd,0x9f,0x01] - i16x8.extmul_high_i8x16_u - - # CHECK: i32x4.extmul_low_i16x8_s # encoding: [0xfd,0xbb,0x01] - i32x4.extmul_low_i16x8_s - - # CHECK: i32x4.extmul_high_i16x8_s # encoding: [0xfd,0xbd,0x01] - i32x4.extmul_high_i16x8_s - - # CHECK: i32x4.extmul_low_i16x8_u # encoding: [0xfd,0xbe,0x01] - i32x4.extmul_low_i16x8_u - - # CHECK: i32x4.extmul_high_i16x8_u # encoding: [0xfd,0xbf,0x01] - i32x4.extmul_high_i16x8_u - - # CHECK: i64x2.extmul_low_i32x4_s # encoding: [0xfd,0xd2,0x01] - i64x2.extmul_low_i32x4_s - - # CHECK: i64x2.extmul_high_i32x4_s # encoding: [0xfd,0xd3,0x01] - i64x2.extmul_high_i32x4_s - - # CHECK: i64x2.extmul_low_i32x4_u # encoding: [0xfd,0xd6,0x01] - i64x2.extmul_low_i32x4_u - - # CHECK: i64x2.extmul_high_i32x4_u # encoding: [0xfd,0xd7,0x01] - i64x2.extmul_high_i32x4_u - - # CHECK: i16x8.extadd_pairwise_i8x16_s # encoding: [0xfd,0xc2,0x01] - i16x8.extadd_pairwise_i8x16_s - - # CHECK: i16x8.extadd_pairwise_i8x16_u # encoding: [0xfd,0xc3,0x01] - i16x8.extadd_pairwise_i8x16_u - - # CHECK: i32x4.extadd_pairwise_i16x8_s # encoding: [0xfd,0xa5,0x01] - i32x4.extadd_pairwise_i16x8_s - - # CHECK: i32x4.extadd_pairwise_i16x8_u # encoding: [0xfd,0xa6,0x01] - i32x4.extadd_pairwise_i16x8_u - - # CHECK: f64x2.convert_low_i32x4_s # encoding: [0xfd,0x53] - f64x2.convert_low_i32x4_s - - # CHECK: f64x2.convert_low_i32x4_u # encoding: [0xfd,0x54] - f64x2.convert_low_i32x4_u - - # CHECK: i32x4.trunc_sat_zero_f64x2_s # encoding: [0xfd,0x55] + # CHECK: i32x4.trunc_sat_zero_f64x2_s # encoding: [0xfd,0xfc,0x01] i32x4.trunc_sat_zero_f64x2_s - # CHECK: i32x4.trunc_sat_zero_f64x2_u # encoding: [0xfd,0x56] + # CHECK: i32x4.trunc_sat_zero_f64x2_u # encoding: [0xfd,0xfd,0x01] i32x4.trunc_sat_zero_f64x2_u - # CHECK: f32x4.demote_zero_f64x2 # encoding: [0xfd,0x57] - f32x4.demote_zero_f64x2 + # CHECK: f64x2.convert_low_i32x4_s # encoding: [0xfd,0xfe,0x01] + f64x2.convert_low_i32x4_s - # CHECK: f64x2.promote_low_f32x4 # encoding: [0xfd,0x69] - f64x2.promote_low_f32x4 + # CHECK: f64x2.convert_low_i32x4_u # encoding: [0xfd,0xff,0x01] + f64x2.convert_low_i32x4_u end_function -- GitLab From 6b053c9867a3ede32e51cef3ed972d5ce5b38bc0 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Thu, 18 Mar 2021 11:32:34 -0700 Subject: [PATCH 0072/1000] [VPlan] Add plain text (not DOT's digraph) dumps I foresee two uses for this: 1) It's easier to use those in debugger. 2) Once we start implementing more VPlan-to-VPlan transformations (especially inner loop massaging stuff), using the vectorized LLVM IR as CHECK targets in LIT test would become too obscure. I can imagine that we'd want to CHECK against VPlan dumps after multiple transformations instead. That would be easier with plain text dumps than with DOT format. 
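For illustration, going by the new VPlan::print and VPBasicBlock::print
implementations in this patch, a plain-text dump of a small single-block
plan should look roughly like the sketch below; the recipe and value names
are invented and depend entirely on the input loop:

    VPlan {
    loop.body:
      WIDEN %lv = load %arrayidx
      WIDEN %add = add %lv, %inc
    No successors
    }

A LIT test can then CHECK individual recipe lines verbatim instead of
matching DOT-escaped strings.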
Reviewed By: fhahn Differential Revision: https://reviews.llvm.org/D96628 --- .../Vectorize/LoopVectorizationPlanner.h | 5 +- .../Transforms/Vectorize/LoopVectorize.cpp | 16 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 139 ++++++++++++------ llvm/lib/Transforms/Vectorize/VPlan.h | 73 ++++++--- .../Transforms/LoopVectorize/icmp-uniforms.ll | 13 +- .../LoopVectorize/vplan-dot-printing.ll | 40 +++++ .../LoopVectorize/vplan-printing.ll | 129 ++++++++-------- .../Transforms/Vectorize/VPlanHCFGTest.cpp | 30 ++-- .../Transforms/Vectorize/VPlanTest.cpp | 43 +++++- 9 files changed, 327 insertions(+), 161 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 1f8d5c8aa195..fae75e318b42 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -256,10 +256,7 @@ public: /// best selected VPlan. void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); - void printPlans(raw_ostream &O) { - for (const auto &Plan : VPlans) - O << *Plan; - } + void printPlans(raw_ostream &O); /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6e310fb1ba95..61b6fa1bcc63 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -360,6 +360,10 @@ cl::opt llvm::EnableLoopVectorization( "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); +cl::opt PrintVPlansInDotFormat( + "vplan-print-in-dot-format", cl::init(false), cl::Hidden, + cl::desc("Use dot format instead of plain text when dumping VPlans")); + /// A helper function that returns the type of loaded or stored value. 
static Type *getMemInstValueType(Value *I) { assert((isa<LoadInst>(I) || isa<StoreInst>(I)) && @@ -7809,6 +7813,14 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, ILV.printDebugTracesAtEnd(); } +void LoopVectorizationPlanner::printPlans(raw_ostream &O) { + for (const auto &Plan : VPlans) + if (PrintVPlansInDotFormat) + Plan->printDOT(O); + else + Plan->print(O); +} + void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl<Instruction *> &DeadInstructions) { @@ -9007,7 +9019,7 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; getAddr()->printAsOperand(O, SlotTracker); @@ -9018,7 +9030,7 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) - O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; + O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; } void VPWidenCallRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 6974502bad70..9e669fa2c82f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -399,6 +399,42 @@ void VPBasicBlock::dropAllReferences(VPValue *NewValue) { } } +void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << getName() << ":\n"; + if (const VPValue *Pred = getPredicate()) { + O << Indent << "BlockPredicate:"; + Pred->printAsOperand(O, SlotTracker); + if (const auto *PredInst = dyn_cast<VPInstruction>(Pred)) + O << " (" << PredInst->getParent()->getName() << ")"; + O << '\n'; + } + + auto RecipeIndent = Indent + " "; + for (const VPRecipeBase &Recipe : *this) { + Recipe.print(O, RecipeIndent, SlotTracker); + O << '\n'; + } + + if (getSuccessors().empty()) { + O << Indent << "No successors\n"; + } else { + O << Indent << "Successor(s): "; + ListSeparator LS; + for (auto *Succ : getSuccessors()) + O << LS << Succ->getName(); + O << '\n'; + } + + if (const VPValue *CBV = getCondBit()) { + O << Indent << "CondBit: "; + CBV->printAsOperand(O, SlotTracker); + if (const auto *CBI = dyn_cast<VPInstruction>(CBV)) + O << " (" << CBI->getParent()->getName() << ")"; + O << '\n'; + } +} + void VPRegionBlock::dropAllReferences(VPValue *NewValue) { for (VPBlockBase *Block : depth_first(Entry)) // Drop all references in VPBasicBlocks and replace all uses with @@ -455,6 +491,17 @@ void VPRegionBlock::execute(VPTransformState *State) { State->Instance.reset(); } +void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << (isReplicator() ?
"<xVFxUF> " : "<x1> ") << getName() << ": {"; + auto NewIndent = Indent + " "; + for (auto *BlockBase : depth_first(Entry)) { + O << '\n'; + BlockBase->print(O, NewIndent, SlotTracker); + } + O << Indent << "}\n"; +} + void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && @@ -685,7 +732,25 @@ void VPlan::execute(VPTransformState *State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD -void VPlan::dump() const { dbgs() << *this << '\n'; } +void VPlan::print(raw_ostream &O) const { + VPSlotTracker SlotTracker(this); + + O << "VPlan {"; + for (const VPBlockBase *Block : depth_first(getEntry())) { + O << '\n'; + Block->print(O, "", SlotTracker); + } + O << "}\n"; +} + +LLVM_DUMP_METHOD +void VPlan::printDOT(raw_ostream &O) const { + VPlanPrinter Printer(O, *this); + Printer.dump(); +} + +LLVM_DUMP_METHOD +void VPlan::dump() const { print(dbgs()); } #endif void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, @@ -804,46 +869,32 @@ void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { } void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { + // Implement dot-formatted dump by performing plain-text dump into the + // temporary storage followed by some post-processing. OS << Indent << getUID(BasicBlock) << " [label =\n"; bumpIndent(1); - OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; - bumpIndent(1); + std::string Str; + raw_string_ostream SS(Str); + // Use no indentation as we need to wrap the lines into quotes ourselves. + BasicBlock->print(SS, "", SlotTracker); - // Dump the block predicate. - const VPValue *Pred = BasicBlock->getPredicate(); - if (Pred) { - OS << " +\n" << Indent << " \"BlockPredicate: \""; - if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) { - PredI->printAsOperand(OS, SlotTracker); - OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) - << ")\\l\""; - } else - Pred->printAsOperand(OS, SlotTracker); - } + // We need to process each line of the output separately, so split + // single-string plain-text dump. + SmallVector<StringRef> Lines; + StringRef(Str).rtrim('\n').split(Lines, "\n"); - for (const VPRecipeBase &Recipe : *BasicBlock) { - OS << " +\n" << Indent << "\""; - // Don't indent inside the recipe printer as we printed it before the - // opening quote already. - Recipe.print(OS, "", SlotTracker); - OS << "\\l\""; - } + auto EmitLine = [&](StringRef Line, StringRef Suffix) { + OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix; + }; - // Dump the condition bit. - const VPValue *CBV = BasicBlock->getCondBit(); - if (CBV) { - OS << " +\n" << Indent << " \"CondBit: "; - if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) { - CBI->printAsOperand(OS, SlotTracker); - OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\""; - } else { - CBV->printAsOperand(OS, SlotTracker); - OS << "\""; - } - } + // Don't need the "+" after the last line.
+ for (auto Line : make_range(Lines.begin(), Lines.end() - 1)) + EmitLine(Line, " +\n"); + EmitLine(Lines.back(), "\n"); + + bumpIndent(-1); + OS << Indent << "]\n"; - bumpIndent(-2); - OS << "\n" << Indent << "]\n"; dumpEdges(BasicBlock); } @@ -863,25 +914,21 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { dumpEdges(Region); } -void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { - std::string IngredientString; - raw_string_ostream RSO(IngredientString); +void VPlanIngredient::print(raw_ostream &O) const { if (auto *Inst = dyn_cast(V)) { if (!Inst->getType()->isVoidTy()) { - Inst->printAsOperand(RSO, false); - RSO << " = "; + Inst->printAsOperand(O, false); + O << " = "; } - RSO << Inst->getOpcodeName() << " "; + O << Inst->getOpcodeName() << " "; unsigned E = Inst->getNumOperands(); if (E > 0) { - Inst->getOperand(0)->printAsOperand(RSO, false); + Inst->getOperand(0)->printAsOperand(O, false); for (unsigned I = 1; I < E; ++I) - Inst->getOperand(I)->printAsOperand(RSO << ", ", false); + Inst->getOperand(I)->printAsOperand(O << ", ", false); } } else // !Inst - V->printAsOperand(RSO, false); - RSO.flush(); - O << DOT::EscapeString(IngredientString); + V->printAsOperand(O, false); } void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9b5d5d7e77be..5a98c63401b0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -577,12 +577,6 @@ public: OS << getName(); } - void print(raw_ostream &OS) const { - // TODO: Only printing VPBB name for now since we only have dot printing - // support for VPInstructions/Recipes. - printAsOperand(OS, false); - } - /// Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -593,6 +587,24 @@ public: /// Replace all operands of VPUsers in the block with \p NewValue and also /// replaces all uses of VPValues defined in the block with NewValue. virtual void dropAllReferences(VPValue *NewValue) = 0; + + /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines + /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using + /// consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual blocks is consistent with the whole VPlan printing. + virtual void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const = 0; + + /// Print plain-text dump of this VPlan to \p O. + void print(raw_ostream &O) const { + VPSlotTracker SlotTracker(getPlan()); + print(O, "", SlotTracker); + } + + /// Dump this VPBlockBase to dbgs(). + void dump() const { print(dbgs()); } }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR @@ -1246,12 +1258,11 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override { - O << " +\n" << Indent << "\"BRANCH-ON-MASK "; + O << Indent << "BRANCH-ON-MASK "; if (VPValue *Mask = getMask()) Mask->printAsOperand(O, SlotTracker); else O << " All-One"; - O << "\\l\""; } /// Return the mask used by this recipe. Note that a full mask is represented @@ -1463,6 +1474,15 @@ public: void dropAllReferences(VPValue *NewValue) override; + /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. 
\p + /// SlotTracker is used to print unnamed VPValue's using consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual blocks is consistent with the whole VPlan printing. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + using VPBlockBase::print; // Get the print(raw_stream &O) version. + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1554,6 +1574,16 @@ public: void execute(struct VPTransformState *State) override; void dropAllReferences(VPValue *NewValue) override; + + /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with + /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using + /// consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual regions is consistent with the whole VPlan printing. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + using VPBlockBase::print; // Get the print(raw_stream &O) version. }; //===----------------------------------------------------------------------===// @@ -1806,6 +1836,12 @@ public: VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } + /// Print this VPlan to \p O. + void print(raw_ostream &O) const; + + /// Print this VPlan in DOT format to \p O. + void printDOT(raw_ostream &O) const; + /// Dump the plan to stderr (for debugging). void dump() const; @@ -1830,11 +1866,6 @@ private: /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); - friend inline raw_ostream &operator<<(raw_ostream &OS, - const struct VPlanIngredient &I); - -private: raw_ostream &OS; const VPlan &Plan; unsigned Depth = 0; @@ -1845,9 +1876,6 @@ private: VPSlotTracker SlotTracker; - VPlanPrinter(raw_ostream &O, const VPlan &P) - : OS(O), Plan(P), SlotTracker(&P) {} - /// Handle indentation. void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1877,25 +1905,28 @@ private: void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, const Twine &Label); - void dump(); +public: + VPlanPrinter(raw_ostream &O, const VPlan &P) + : OS(O), Plan(P), SlotTracker(&P) {} - static void printAsIngredient(raw_ostream &O, const Value *V); + void dump(); }; struct VPlanIngredient { const Value *V; VPlanIngredient(const Value *V) : V(V) {} + + void print(raw_ostream &O) const; }; inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { - VPlanPrinter::printAsIngredient(OS, I.V); + I.print(OS); return OS; } inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { - VPlanPrinter Printer(OS, Plan); - Printer.dump(); + Plan.print(OS); return OS; } diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll index 6aa385d1df8d..181a7d70da82 100644 --- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,12 +36,13 @@ for.end: } ; Check for crash exposed by D76992. 
-; CHECK: N0 [label = -; CHECK-NEXT: "loop:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi 0, %iv.next\l" + -; CHECK-NEXT: "WIDEN ir<%cond0> = icmp ir<%iv>, ir<13>\l" + -; CHECK-NEXT: "WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20>\l" -; CHECK-NEXT: ] +; CHECK: VPlan { +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: WIDEN ir<%cond0> = icmp ir<%iv>, ir<13> +; CHECK-NEXT: WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20> +; CHECK-NEXT: No successor +; CHECK-NEXT: } define void @test() { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll new file mode 100644 index 000000000000..7d8d18dcfdaa --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -0,0 +1,40 @@ +; REQUIRES: asserts + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -vplan-print-in-dot-format -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Verify that -vplan-print-in-dot-format option works. + +define void @print_call_and_memory(i64 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +; CHECK: N0 [label = +; CHECK-NEXT: "for.body:\l" + +; CHECK-NEXT: " WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + +; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr ir\<%y\>, ir\<%iv\>\l" + +; CHECK-NEXT: " WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" + +; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>)\l" + +; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr ir\<%x\>, ir\<%iv\>\l" + +; CHECK-NEXT: " WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" + +; CHECK-NEXT: "No successors\l" +; CHECK-NEXT: ] +; +entry: + %cmp6 = icmp sgt i64 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %iv + %lv = load float, float* %arrayidx, align 4 + %call = tail call float @llvm.sqrt.f32(float %lv) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %iv + store float %call, float* %arrayidx2, align 4 + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare float @llvm.sqrt.f32(float) nounwind readnone diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 1f649f3dc206..93718ffbeab9 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -7,16 +7,17 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Tests for printing VPlans. 
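; (Illustrative aside, not part of the patch: the dumps checked below are
; produced by the new VPlan::print entry point, reached through
; -debug-only=loop-vectorize. The plain-text format brackets each plan with
; "VPlan {" and "}", prints every block as "<name>:" followed by its recipes,
; and closes each block with "Successor(s): ..." or "No successors". With
; -vplan-print-in-dot-format the same plans render as a DOT digraph instead,
; as vplan-dot-printing.ll above verifies.)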
define void @print_call_and_memory(i64 %n, float* noalias %y, float* noalias %x) nounwind uwtable { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + -; CHECK-NEXT: "CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + -; CHECK-NEXT: "WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>)\l" + -; CHECK-NEXT: "CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN store ir<%arrayidx2>, ir<%call>\l" -; CHECK-NEXT: ] - +; CHECK: VPlan { +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> +; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> +; CHECK-NEXT: WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>) +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv> +; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%call> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; entry: %cmp6 = icmp sgt i64 %n, 0 br i1 %cmp6, label %for.body, label %for.end @@ -37,18 +38,19 @@ for.end: ; preds = %for.body, %entry } define void @print_widen_gep_and_select(i64 %n, float* noalias %y, float* noalias %x, float* %z) nounwind uwtable { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + -; CHECK-NEXT: "WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + -; CHECK-NEXT: "WIDEN ir<%cmp> = icmp ir<%arrayidx>, ir<%z>\l" + -; CHECK-NEXT: "WIDEN-SELECT ir<%sel> = select ir<%cmp>, ir<1.000000e+01>, ir<2.000000e+01>\l" + -; CHECK-NEXT: "WIDEN ir<%add> = fadd ir<%lv>, ir<%sel>\l" + -; CHECK-NEXT: "CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN store ir<%arrayidx2>, ir<%add>\l" -; CHECK-NEXT: ] - +; CHECK: VPlan { +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 +; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> +; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%arrayidx>, ir<%z> +; CHECK-NEXT: WIDEN-SELECT ir<%sel> = select ir<%cmp>, ir<1.000000e+01>, ir<2.000000e+01> +; CHECK-NEXT: WIDEN ir<%add> = fadd ir<%lv>, ir<%sel> +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv> +; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%add> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; entry: %cmp6 = icmp sgt i64 %n, 0 br i1 %cmp6, label %for.body, label %for.end @@ -71,15 +73,16 @@ for.end: ; preds = %for.body, %entry } define float @print_reduction(i64 %n, float* noalias %y) { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + -; CHECK-NEXT: "WIDEN-PHI %red = phi %red.next, 0.000000e+00\l" + -; CHECK-NEXT: "CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + -; CHECK-NEXT: "REDUCE ir<%red.next> = ir<%red> + reduce.fadd (ir<%lv>)\l" -; CHECK-NEXT: ] - +; CHECK: VPlan { +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 +; CHECK-NEXT: WIDEN-PHI %red = phi %red.next, 0.000000e+00 +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> +; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.fadd (ir<%lv>) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; entry: br label %for.body @@ -98,36 +101,40 @@ for.end: ; preds = 
%for.body, %entry } define void @print_replicate_predicated_phi(i64 %n, i64* %x) { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %i = phi 0, %i.next\l" + -; CHECK-NEXT: "WIDEN ir<%cmp> = icmp ir<%i>, ir<5>\l" -; CHECK-NEXT: ] -; -; CHECK: N2 [label = -; CHECK-NEXT: "pred.udiv.entry:\n" + -; CHECK-NEXT: + -; CHECK-NEXT: "BRANCH-ON-MASK ir<%cmp>\l"\l -; CHECK-NEXT: "CondBit: ir<%cmp>" -; CHECK-NEXT: ] -; -; CHECK: N4 [label = -; CHECK-NEXT: "pred.udiv.if:\n" + -; CHECK-NEXT: "REPLICATE ir<%tmp4> = udiv ir<%n>, ir<%i> (S->V)\l" -; CHECK-NEXT: ] -; -; CHECK: N5 [label = -; CHECK-NEXT: "pred.udiv.continue:\n" + -; CHECK-NEXT: "PHI-PREDICATED-INSTRUCTION vp<%3> = ir<%tmp4>\l" -; CHECK-NEXT: ] -; -; CHECK: N7 [label = -; CHECK-NEXT: "for.inc:\n" + -; CHECK-NEXT: "EMIT vp<%4> = not ir<%cmp>\l" + -; CHECK-NEXT: "BLEND %d = ir<0>/vp<%4> vp<%3>/ir<%cmp>\l" + -; CHECK-NEXT: "CLONE ir<%idx> = getelementptr ir<%x>, ir<%i>\l" + -; CHECK-NEXT: "WIDEN store ir<%idx>, ir<%d>\l" -; CHECK-NEXT: ] +; CHECK: VPlan { +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-INDUCTION %i = phi 0, %i.next +; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%i>, ir<5> +; CHECK-NEXT: Successor(s): if.then +; CHECK-EMPTY: +; CHECK-NEXT: if.then: +; CHECK-NEXT: Successor(s): pred.udiv +; CHECK-EMPTY: +; CHECK-NEXT: pred.udiv: { +; CHECK-NEXT: pred.udiv.entry: +; CHECK-NEXT: BRANCH-ON-MASK ir<%cmp> +; CHECK-NEXT: Successor(s): pred.udiv.if, pred.udiv.continue +; CHECK-NEXT: CondBit: ir<%cmp> +; CHECK-EMPTY: +; CHECK-NEXT: pred.udiv.if: +; CHECK-NEXT: REPLICATE ir<%tmp4> = udiv ir<%n>, ir<%i> (S->V) +; CHECK-NEXT: Successor(s): pred.udiv.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.udiv.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%3> = ir<%tmp4> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-EMPTY: +; CHECK-NEXT: if.then.0: +; CHECK-NEXT: Successor(s): for.inc +; CHECK-EMPTY: +; CHECK-NEXT: for.inc: +; CHECK-NEXT: EMIT vp<%4> = not ir<%cmp> +; CHECK-NEXT: BLEND %d = ir<0>/vp<%4> vp<%3>/ir<%cmp> +; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, ir<%i> +; CHECK-NEXT: WIDEN store ir<%idx>, ir<%d> +; CHECK-NEXT: No successors +; CHECK-NEXT: } ; entry: br label %for.body diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 880b8f711462..cf314043f011 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -93,7 +93,8 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { // as this is not required with the new printing. 
Plan->addVPValue(&*F->arg_begin()); std::string FullDump; - raw_string_ostream(FullDump) << *Plan; + raw_string_ostream OS(FullDump); + Plan->printDOT(OS); const char *ExpectedStr = R"(digraph VPlan { graph [labelloc=t, fontsize=30; label="Vectorization Plan"] node [shape=rect, fontname=Courier, fontsize=30] @@ -103,25 +104,28 @@ compound=true fontname=Courier label="\ TopRegion" N1 [label = - "entry:\n" + "entry:\l" + + "Successor(s): for.body\l" ] N1 -> N2 [ label=""] N2 [label = - "for.body:\n" + - "WIDEN-PHI %indvars.iv = phi 0, %indvars.iv.next\l" + - "EMIT ir<%arr.idx> = getelementptr ir<%A> ir<%indvars.iv>\l" + - "EMIT ir<%l1> = load ir<%arr.idx>\l" + - "EMIT ir<%res> = add ir<%l1> ir<10>\l" + - "EMIT store ir<%res> ir<%arr.idx>\l" + - "EMIT ir<%indvars.iv.next> = add ir<%indvars.iv> ir<1>\l" + - "EMIT ir<%exitcond> = icmp ir<%indvars.iv.next> ir<%N>\l" + - "CondBit: ir<%exitcond> (for.body)\l" + "for.body:\l" + + " WIDEN-PHI %indvars.iv = phi 0, %indvars.iv.next\l" + + " EMIT ir\<%arr.idx\> = getelementptr ir\<%A\> ir\<%indvars.iv\>\l" + + " EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" + + " EMIT ir\<%res\> = add ir\<%l1\> ir\<10\>\l" + + " EMIT store ir\<%res\> ir\<%arr.idx\>\l" + + " EMIT ir\<%indvars.iv.next\> = add ir\<%indvars.iv\> ir\<1\>\l" + + " EMIT ir\<%exitcond\> = icmp ir\<%indvars.iv.next\> ir\<%N\>\l" + + "Successor(s): for.body, for.end\l" + + "CondBit: ir\<%exitcond\> (for.body)\l" ] N2 -> N2 [ label="T"] N2 -> N3 [ label="F"] N3 [label = - "for.end:\n" + - "EMIT ret\l" + "for.end:\l" + + " EMIT ret\l" + + "No successors\l" ] } } diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index f8f1562d548c..71f27f95bad7 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -333,12 +333,14 @@ TEST(VPBasicBlockTest, print) { VPBB1->appendRecipe(I1); VPBB1->appendRecipe(I2); VPBB1->appendRecipe(I3); + VPBB1->setName("bb1"); VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1}); VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4}); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBB2->appendRecipe(I4); VPBB2->appendRecipe(I5); + VPBB2->setName("bb2"); VPBlockUtils::connectBlocks(VPBB1, VPBB2); @@ -355,7 +357,8 @@ TEST(VPBasicBlockTest, print) { VPlan Plan; Plan.setEntry(VPBB1); std::string FullDump; - raw_string_ostream(FullDump) << Plan; + raw_string_ostream OS(FullDump); + Plan.printDOT(OS); const char *ExpectedStr = R"(digraph VPlan { graph [labelloc=t, fontsize=30; label="Vectorization Plan"] @@ -363,21 +366,45 @@ node [shape=rect, fontname=Courier, fontsize=30] edge [fontname=Courier, fontsize=30] compound=true N0 [label = - ":\n" + - "EMIT vp<%0> = add\l" + - "EMIT vp<%1> = sub vp<%0>\l" + - "EMIT br vp<%0> vp<%1>\l" + "bb1:\l" + + " EMIT vp\<%0\> = add\l" + + " EMIT vp\<%1\> = sub vp\<%0\>\l" + + " EMIT br vp\<%0\> vp\<%1\>\l" + + "Successor(s): bb2\l" ] N0 -> N1 [ label=""] N1 [label = - ":\n" + - "EMIT vp<%3> = mul vp<%1> vp<%0>\l" + - "EMIT ret vp<%3>\l" + "bb2:\l" + + " EMIT vp\<%3\> = mul vp\<%1\> vp\<%0\>\l" + + " EMIT ret vp\<%3\>\l" + + "No successors\l" ] } )"; EXPECT_EQ(ExpectedStr, FullDump); + const char *ExpectedBlock1Str = R"(bb1: + EMIT vp<%0> = add + EMIT vp<%1> = sub vp<%0> + EMIT br vp<%0> vp<%1> +Successor(s): bb2 +)"; + std::string Block1Dump; + raw_string_ostream OS1(Block1Dump); + VPBB1->print(OS1); + EXPECT_EQ(ExpectedBlock1Str, Block1Dump); + + // Ensure that numbering is good when dumping the second block 
in isolation. + const char *ExpectedBlock2Str = R"(bb2: + EMIT vp<%3> = mul vp<%1> vp<%0> + EMIT ret vp<%3> +No successors +)"; + std::string Block2Dump; + raw_string_ostream OS2(Block2Dump); + VPBB2->print(OS2); + EXPECT_EQ(ExpectedBlock2Str, Block2Dump); + { std::string I3Dump; raw_string_ostream OS(I3Dump); -- GitLab From f6af5efcec4171080c036ad55a2b4db9fc5c37fa Mon Sep 17 00:00:00 2001 From: Muiez Ahmed Date: Thu, 18 Mar 2021 14:23:55 -0400 Subject: [PATCH 0073/1000] [SystemZ][z/OS] vasprintf fix libc++ The aim is to use the correct vasprintf implementation for z/OS libc++, where a copy of va_list ap is needed. In particular, it avoids the potential that the initial internal call to vsnprintf will modify ap and the subsequent call to vsnprintf will use that modified ap. Differential Revision: https://reviews.llvm.org/D97473 --- libcxx/include/__support/ibm/xlocale.h | 20 +++++++++++--------- libcxx/src/support/win32/support.cpp | 5 ++++- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/libcxx/include/__support/ibm/xlocale.h b/libcxx/include/__support/ibm/xlocale.h index b4d21172bcfa..563b465a8f65 100644 --- a/libcxx/include/__support/ibm/xlocale.h +++ b/libcxx/include/__support/ibm/xlocale.h @@ -10,6 +10,7 @@ #ifndef _LIBCPP_SUPPORT_IBM_XLOCALE_H #define _LIBCPP_SUPPORT_IBM_XLOCALE_H +#include <stdarg.h> #include <__support/ibm/locale_mgmt_aix.h> #include <__support/ibm/locale_mgmt_zos.h> @@ -268,18 +269,19 @@ unsigned long strtoul_l(const char *__nptr, char **__endptr, } static inline -int vasprintf(char **strp, const char *fmt, va_list ap) -{ +int vasprintf(char **strp, const char *fmt, va_list ap) { const size_t buff_size = 256; - int str_size; - if ((*strp = (char *)malloc(buff_size)) == NULL) - { + if ((*strp = (char *)malloc(buff_size)) == NULL) { return -1; } - if ((str_size = vsnprintf(*strp, buff_size, fmt, ap)) >= buff_size) - { - if ((*strp = (char *)realloc(*strp, str_size + 1)) == NULL) - { + + va_list ap_copy; + va_copy(ap_copy, ap); + int str_size = vsnprintf(*strp, buff_size, fmt, ap_copy); + va_end(ap_copy); + + if ((size_t) str_size >= buff_size) { + if ((*strp = (char *)realloc(*strp, str_size + 1)) == NULL) { return -1; } str_size = vsnprintf(*strp, str_size + 1, fmt, ap); diff --git a/libcxx/src/support/win32/support.cpp b/libcxx/src/support/win32/support.cpp index 52453f547926..5890e669a34e 100644 --- a/libcxx/src/support/win32/support.cpp +++ b/libcxx/src/support/win32/support.cpp @@ -22,7 +22,10 @@ int __libcpp_vasprintf( char **sptr, const char *__restrict format, va_list ap ) { *sptr = NULL; // Query the count required. - int count = _vsnprintf( NULL, 0, format, ap ); + va_list ap_copy; + va_copy(ap_copy, ap); + int count = _vsnprintf( NULL, 0, format, ap_copy ); + va_end(ap_copy); if (count < 0) return count; size_t buffer_size = static_cast<size_t>(count) + 1; -- GitLab From 3614df3537f9d699fe0835baf6fc0ddd5c9d699d Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 18 Mar 2021 19:20:39 +0000 Subject: [PATCH 0074/1000] Revert "[VPlan] Add plain text (not DOT's digraph) dumps" This reverts commit 6b053c9867a3ede32e51cef3ed972d5ce5b38bc0.
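Stepping back to the vasprintf change above: the first call to vsnprintf may leave `ap` in an indeterminate state, so passing it to vsnprintf a second time is undefined behavior, and the fix probes the length on a copy. A minimal standalone sketch of the pattern (hypothetical helper, not taken from the patch):

#include <stdarg.h>
#include <stdio.h>

// Measure the formatted length without consuming the caller's va_list.
static int formatted_length(const char *fmt, va_list ap) {
  va_list ap_copy;
  va_copy(ap_copy, ap);                      // probe on a copy...
  int n = vsnprintf(NULL, 0, fmt, ap_copy);  // ...which vsnprintf may clobber
  va_end(ap_copy);
  return n;                                  // `ap` remains valid for reuse
}

Both hunks above follow this shape: copy, probe, va_end, then do the real formatting with the original `ap`.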
The build is broken: ld.lld: error: undefined symbol: llvm::VPlan::printDOT(llvm::raw_ostream&) const >>> referenced by LoopVectorize.cpp >>> LoopVectorize.cpp.o:(llvm::LoopVectorizationPlanner::printPlans(llvm::raw_ostream&)) in archive lib/libLLVMVectorize.a --- .../Vectorize/LoopVectorizationPlanner.h | 5 +- .../Transforms/Vectorize/LoopVectorize.cpp | 16 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 139 ++++++------------ llvm/lib/Transforms/Vectorize/VPlan.h | 73 +++------ .../Transforms/LoopVectorize/icmp-uniforms.ll | 13 +- .../LoopVectorize/vplan-dot-printing.ll | 40 ----- .../LoopVectorize/vplan-printing.ll | 129 ++++++++-------- .../Transforms/Vectorize/VPlanHCFGTest.cpp | 30 ++-- .../Transforms/Vectorize/VPlanTest.cpp | 43 +----- 9 files changed, 161 insertions(+), 327 deletions(-) delete mode 100644 llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index fae75e318b42..1f8d5c8aa195 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -256,7 +256,10 @@ public: /// best selected VPlan. void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); - void printPlans(raw_ostream &O); + void printPlans(raw_ostream &O) { + for (const auto &Plan : VPlans) + O << *Plan; + } /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 61b6fa1bcc63..6e310fb1ba95 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -360,10 +360,6 @@ cl::opt<bool> llvm::EnableLoopVectorization( "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); -cl::opt<bool> PrintVPlansInDotFormat( - "vplan-print-in-dot-format", cl::init(false), cl::Hidden, - cl::desc("Use dot format instead of plain text when dumping VPlans")); - /// A helper function that returns the type of loaded or stored value.
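// (Aside, not part of the revert: the undefined symbol is an NDEBUG guard
// mismatch in the reverted commit. VPlan.h declared print() and printDOT()
// unconditionally, but VPlan.cpp defined them inside
//   #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// while the new LoopVectorizationPlanner::printPlans, which is compiled
// unconditionally, called printDOT. A release build with NDEBUG set and
// LLVM_ENABLE_DUMP unset therefore emits the call but never the definition,
// which is exactly the ld.lld error quoted above.)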
static Type *getMemInstValueType(Value *I) { assert((isa(I) || isa(I)) && @@ -7813,14 +7809,6 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, ILV.printDebugTracesAtEnd(); } -void LoopVectorizationPlanner::printPlans(raw_ostream &O) { - for (const auto &Plan : VPlans) - if (PrintVPlansInDotFormat) - Plan->printDOT(O); - else - Plan->print(O); -} - void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl &DeadInstructions) { @@ -9019,7 +9007,7 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + O << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; getAddr()->printAsOperand(O, SlotTracker); @@ -9030,7 +9018,7 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) - O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; + O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; } void VPWidenCallRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 9e669fa2c82f..6974502bad70 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -399,42 +399,6 @@ void VPBasicBlock::dropAllReferences(VPValue *NewValue) { } } -void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << getName() << ":\n"; - if (const VPValue *Pred = getPredicate()) { - O << Indent << "BlockPredicate:"; - Pred->printAsOperand(O, SlotTracker); - if (const auto *PredInst = dyn_cast(Pred)) - O << " (" << PredInst->getParent()->getName() << ")"; - O << '\n'; - } - - auto RecipeIndent = Indent + " "; - for (const VPRecipeBase &Recipe : *this) { - Recipe.print(O, RecipeIndent, SlotTracker); - O << '\n'; - } - - if (getSuccessors().empty()) { - O << Indent << "No successors\n"; - } else { - O << Indent << "Successor(s): "; - ListSeparator LS; - for (auto *Succ : getSuccessors()) - O << LS << Succ->getName(); - O << '\n'; - } - - if (const VPValue *CBV = getCondBit()) { - O << Indent << "CondBit: "; - CBV->printAsOperand(O, SlotTracker); - if (const auto *CBI = dyn_cast(CBV)) - O << " (" << CBI->getParent()->getName() << ")"; - O << '\n'; - } -} - void VPRegionBlock::dropAllReferences(VPValue *NewValue) { for (VPBlockBase *Block : depth_first(Entry)) // Drop all references in VPBasicBlocks and replace all uses with @@ -491,17 +455,6 @@ void VPRegionBlock::execute(VPTransformState *State) { State->Instance.reset(); } -void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const { - O << Indent << (isReplicator() ? 
" " : " ") << getName() << ": {"; - auto NewIndent = Indent + " "; - for (auto *BlockBase : depth_first(Entry)) { - O << '\n'; - BlockBase->print(O, NewIndent, SlotTracker); - } - O << Indent << "}\n"; -} - void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && @@ -732,25 +685,7 @@ void VPlan::execute(VPTransformState *State) { #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD -void VPlan::print(raw_ostream &O) const { - VPSlotTracker SlotTracker(this); - - O << "VPlan {"; - for (const VPBlockBase *Block : depth_first(getEntry())) { - O << '\n'; - Block->print(O, "", SlotTracker); - } - O << "}\n"; -} - -LLVM_DUMP_METHOD -void VPlan::printDOT(raw_ostream &O) const { - VPlanPrinter Printer(O, *this); - Printer.dump(); -} - -LLVM_DUMP_METHOD -void VPlan::dump() const { print(dbgs()); } +void VPlan::dump() const { dbgs() << *this << '\n'; } #endif void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, @@ -869,32 +804,46 @@ void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { } void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { - // Implement dot-formatted dump by performing plain-text dump into the - // temporary storage followed by some post-processing. OS << Indent << getUID(BasicBlock) << " [label =\n"; bumpIndent(1); - std::string Str; - raw_string_ostream SS(Str); - // Use no indentation as we need to wrap the lines into quotes ourselves. - BasicBlock->print(SS, "", SlotTracker); - - // We need to process each line of the output separately, so split - // single-string plain-text dump. - SmallVector Lines; - StringRef(Str).rtrim('\n').split(Lines, "\n"); + OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; + bumpIndent(1); - auto EmitLine = [&](StringRef Line, StringRef Suffix) { - OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix; - }; + // Dump the block predicate. + const VPValue *Pred = BasicBlock->getPredicate(); + if (Pred) { + OS << " +\n" << Indent << " \"BlockPredicate: \""; + if (const VPInstruction *PredI = dyn_cast(Pred)) { + PredI->printAsOperand(OS, SlotTracker); + OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) + << ")\\l\""; + } else + Pred->printAsOperand(OS, SlotTracker); + } - // Don't need the "+" after the last line. - for (auto Line : make_range(Lines.begin(), Lines.end() - 1)) - EmitLine(Line, " +\n"); - EmitLine(Lines.back(), "\n"); + for (const VPRecipeBase &Recipe : *BasicBlock) { + OS << " +\n" << Indent << "\""; + // Don't indent inside the recipe printer as we printed it before the + // opening quote already. + Recipe.print(OS, "", SlotTracker); + OS << "\\l\""; + } - bumpIndent(-1); - OS << Indent << "]\n"; + // Dump the condition bit. 
+ const VPValue *CBV = BasicBlock->getCondBit(); + if (CBV) { + OS << " +\n" << Indent << " \"CondBit: "; + if (const VPInstruction *CBI = dyn_cast(CBV)) { + CBI->printAsOperand(OS, SlotTracker); + OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\""; + } else { + CBV->printAsOperand(OS, SlotTracker); + OS << "\""; + } + } + bumpIndent(-2); + OS << "\n" << Indent << "]\n"; dumpEdges(BasicBlock); } @@ -914,21 +863,25 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { dumpEdges(Region); } -void VPlanIngredient::print(raw_ostream &O) const { +void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { + std::string IngredientString; + raw_string_ostream RSO(IngredientString); if (auto *Inst = dyn_cast(V)) { if (!Inst->getType()->isVoidTy()) { - Inst->printAsOperand(O, false); - O << " = "; + Inst->printAsOperand(RSO, false); + RSO << " = "; } - O << Inst->getOpcodeName() << " "; + RSO << Inst->getOpcodeName() << " "; unsigned E = Inst->getNumOperands(); if (E > 0) { - Inst->getOperand(0)->printAsOperand(O, false); + Inst->getOperand(0)->printAsOperand(RSO, false); for (unsigned I = 1; I < E; ++I) - Inst->getOperand(I)->printAsOperand(O << ", ", false); + Inst->getOperand(I)->printAsOperand(RSO << ", ", false); } } else // !Inst - V->printAsOperand(O, false); + V->printAsOperand(RSO, false); + RSO.flush(); + O << DOT::EscapeString(IngredientString); } void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 5a98c63401b0..9b5d5d7e77be 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -577,6 +577,12 @@ public: OS << getName(); } + void print(raw_ostream &OS) const { + // TODO: Only printing VPBB name for now since we only have dot printing + // support for VPInstructions/Recipes. + printAsOperand(OS, false); + } + /// Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -587,24 +593,6 @@ public: /// Replace all operands of VPUsers in the block with \p NewValue and also /// replaces all uses of VPValues defined in the block with NewValue. virtual void dropAllReferences(VPValue *NewValue) = 0; - - /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines - /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using - /// consequtive numbers. - /// - /// Note that the numbering is applied to the whole VPlan, so printing - /// individual blocks is consistent with the whole VPlan printing. - virtual void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const = 0; - - /// Print plain-text dump of this VPlan to \p O. - void print(raw_ostream &O) const { - VPSlotTracker SlotTracker(getPlan()); - print(O, "", SlotTracker); - } - - /// Dump this VPBlockBase to dbgs(). - void dump() const { print(dbgs()); } }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR @@ -1258,11 +1246,12 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override { - O << Indent << "BRANCH-ON-MASK "; + O << " +\n" << Indent << "\"BRANCH-ON-MASK "; if (VPValue *Mask = getMask()) Mask->printAsOperand(O, SlotTracker); else O << " All-One"; + O << "\\l\""; } /// Return the mask used by this recipe. 
Note that a full mask is represented @@ -1474,15 +1463,6 @@ public: void dropAllReferences(VPValue *NewValue) override; - /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. \p - /// SlotTracker is used to print unnamed VPValue's using consequtive numbers. - /// - /// Note that the numbering is applied to the whole VPlan, so printing - /// individual blocks is consistent with the whole VPlan printing. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; - using VPBlockBase::print; // Get the print(raw_stream &O) version. - private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1574,16 +1554,6 @@ public: void execute(struct VPTransformState *State) override; void dropAllReferences(VPValue *NewValue) override; - - /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with - /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using - /// consequtive numbers. - /// - /// Note that the numbering is applied to the whole VPlan, so printing - /// individual regions is consistent with the whole VPlan printing. - void print(raw_ostream &O, const Twine &Indent, - VPSlotTracker &SlotTracker) const override; - using VPBlockBase::print; // Get the print(raw_stream &O) version. }; //===----------------------------------------------------------------------===// @@ -1836,12 +1806,6 @@ public: VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } - /// Print this VPlan to \p O. - void print(raw_ostream &O) const; - - /// Print this VPlan in DOT format to \p O. - void printDOT(raw_ostream &O) const; - /// Dump the plan to stderr (for debugging). void dump() const; @@ -1866,6 +1830,11 @@ private: /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { + friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); + friend inline raw_ostream &operator<<(raw_ostream &OS, + const struct VPlanIngredient &I); + +private: raw_ostream &OS; const VPlan &Plan; unsigned Depth = 0; @@ -1876,6 +1845,9 @@ class VPlanPrinter { VPSlotTracker SlotTracker; + VPlanPrinter(raw_ostream &O, const VPlan &P) + : OS(O), Plan(P), SlotTracker(&P) {} + /// Handle indentation. 
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1905,28 +1877,25 @@ class VPlanPrinter { void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, const Twine &Label); -public: - VPlanPrinter(raw_ostream &O, const VPlan &P) - : OS(O), Plan(P), SlotTracker(&P) {} - void dump(); + + static void printAsIngredient(raw_ostream &O, const Value *V); }; struct VPlanIngredient { const Value *V; VPlanIngredient(const Value *V) : V(V) {} - - void print(raw_ostream &O) const; }; inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { - I.print(OS); + VPlanPrinter::printAsIngredient(OS, I.V); return OS; } inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { - Plan.print(OS); + VPlanPrinter Printer(OS, Plan); + Printer.dump(); return OS; } diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll index 181a7d70da82..6aa385d1df8d 100644 --- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,13 +36,12 @@ for.end: } ; Check for crash exposed by D76992. -; CHECK: VPlan { -; CHECK-NEXT: loop: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next -; CHECK-NEXT: WIDEN ir<%cond0> = icmp ir<%iv>, ir<13> -; CHECK-NEXT: WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20> -; CHECK-NEXT: No successor -; CHECK-NEXT: } +; CHECK: N0 [label = +; CHECK-NEXT: "loop:\n" + +; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi 0, %iv.next\l" + +; CHECK-NEXT: "WIDEN ir<%cond0> = icmp ir<%iv>, ir<13>\l" + +; CHECK-NEXT: "WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20>\l" +; CHECK-NEXT: ] define void @test() { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll deleted file mode 100644 index 7d8d18dcfdaa..000000000000 --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ /dev/null @@ -1,40 +0,0 @@ -; REQUIRES: asserts - -; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -vplan-print-in-dot-format -disable-output %s 2>&1 | FileCheck %s - -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" - -; Verify that -vplan-print-in-dot-format option works. 
- -define void @print_call_and_memory(i64 %n, float* noalias %y, float* noalias %x) nounwind uwtable { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\l" + -; CHECK-NEXT: " WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + -; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr ir\<%y\>, ir\<%iv\>\l" + -; CHECK-NEXT: " WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" + -; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>)\l" + -; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr ir\<%x\>, ir\<%iv\>\l" + -; CHECK-NEXT: " WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" + -; CHECK-NEXT: "No successors\l" -; CHECK-NEXT: ] -; -entry: - %cmp6 = icmp sgt i64 %n, 0 - br i1 %cmp6, label %for.body, label %for.end - -for.body: ; preds = %entry, %for.body - %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] - %arrayidx = getelementptr inbounds float, float* %y, i64 %iv - %lv = load float, float* %arrayidx, align 4 - %call = tail call float @llvm.sqrt.f32(float %lv) nounwind readnone - %arrayidx2 = getelementptr inbounds float, float* %x, i64 %iv - store float %call, float* %arrayidx2, align 4 - %iv.next = add i64 %iv, 1 - %exitcond = icmp eq i64 %iv.next, %n - br i1 %exitcond, label %for.end, label %for.body - -for.end: ; preds = %for.body, %entry - ret void -} - -declare float @llvm.sqrt.f32(float) nounwind readnone diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 93718ffbeab9..1f649f3dc206 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -7,17 +7,16 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Tests for printing VPlans. define void @print_call_and_memory(i64 %n, float* noalias %y, float* noalias %x) nounwind uwtable { -; CHECK: VPlan { -; CHECK-NEXT: for.body: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> -; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> -; CHECK-NEXT: WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>) -; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv> -; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%call> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; +; CHECK: N0 [label = +; CHECK-NEXT: "for.body:\n" + +; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + +; CHECK-NEXT: "CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + +; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + +; CHECK-NEXT: "WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>)\l" + +; CHECK-NEXT: "CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv>\l" + +; CHECK-NEXT: "WIDEN store ir<%arrayidx2>, ir<%call>\l" +; CHECK-NEXT: ] + entry: %cmp6 = icmp sgt i64 %n, 0 br i1 %cmp6, label %for.body, label %for.end @@ -38,19 +37,18 @@ for.end: ; preds = %for.body, %entry } define void @print_widen_gep_and_select(i64 %n, float* noalias %y, float* noalias %x, float* %z) nounwind uwtable { -; CHECK: VPlan { -; CHECK-NEXT: for.body: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 -; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> -; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> -; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%arrayidx>, ir<%z> -; CHECK-NEXT: WIDEN-SELECT ir<%sel> = select ir<%cmp>, ir<1.000000e+01>, ir<2.000000e+01> -; CHECK-NEXT: WIDEN ir<%add> = fadd ir<%lv>, ir<%sel> -; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv> -; CHECK-NEXT: WIDEN store ir<%arrayidx2>, 
ir<%add> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; +; CHECK: N0 [label = +; CHECK-NEXT: "for.body:\n" + +; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + +; CHECK-NEXT: "WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + +; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + +; CHECK-NEXT: "WIDEN ir<%cmp> = icmp ir<%arrayidx>, ir<%z>\l" + +; CHECK-NEXT: "WIDEN-SELECT ir<%sel> = select ir<%cmp>, ir<1.000000e+01>, ir<2.000000e+01>\l" + +; CHECK-NEXT: "WIDEN ir<%add> = fadd ir<%lv>, ir<%sel>\l" + +; CHECK-NEXT: "CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv>\l" + +; CHECK-NEXT: "WIDEN store ir<%arrayidx2>, ir<%add>\l" +; CHECK-NEXT: ] + entry: %cmp6 = icmp sgt i64 %n, 0 br i1 %cmp6, label %for.body, label %for.end @@ -73,16 +71,15 @@ for.end: ; preds = %for.body, %entry } define float @print_reduction(i64 %n, float* noalias %y) { -; CHECK: VPlan { -; CHECK-NEXT: for.body: -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 -; CHECK-NEXT: WIDEN-PHI %red = phi %red.next, 0.000000e+00 -; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> -; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> -; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.fadd (ir<%lv>) -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; +; CHECK: N0 [label = +; CHECK-NEXT: "for.body:\n" + +; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + +; CHECK-NEXT: "WIDEN-PHI %red = phi %red.next, 0.000000e+00\l" + +; CHECK-NEXT: "CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + +; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + +; CHECK-NEXT: "REDUCE ir<%red.next> = ir<%red> + reduce.fadd (ir<%lv>)\l" +; CHECK-NEXT: ] + entry: br label %for.body @@ -101,40 +98,36 @@ for.end: ; preds = %for.body, %entry } define void @print_replicate_predicated_phi(i64 %n, i64* %x) { -; CHECK: VPlan { -; CHECK-NEXT: for.body: -; CHECK-NEXT: WIDEN-INDUCTION %i = phi 0, %i.next -; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%i>, ir<5> -; CHECK-NEXT: Successor(s): if.then -; CHECK-EMPTY: -; CHECK-NEXT: if.then: -; CHECK-NEXT: Successor(s): pred.udiv -; CHECK-EMPTY: -; CHECK-NEXT: pred.udiv: { -; CHECK-NEXT: pred.udiv.entry: -; CHECK-NEXT: BRANCH-ON-MASK ir<%cmp> -; CHECK-NEXT: Successor(s): pred.udiv.if, pred.udiv.continue -; CHECK-NEXT: CondBit: ir<%cmp> -; CHECK-EMPTY: -; CHECK-NEXT: pred.udiv.if: -; CHECK-NEXT: REPLICATE ir<%tmp4> = udiv ir<%n>, ir<%i> (S->V) -; CHECK-NEXT: Successor(s): pred.udiv.continue -; CHECK-EMPTY: -; CHECK-NEXT: pred.udiv.continue: -; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%3> = ir<%tmp4> -; CHECK-NEXT: No successors -; CHECK-NEXT: } -; CHECK-EMPTY: -; CHECK-NEXT: if.then.0: -; CHECK-NEXT: Successor(s): for.inc -; CHECK-EMPTY: -; CHECK-NEXT: for.inc: -; CHECK-NEXT: EMIT vp<%4> = not ir<%cmp> -; CHECK-NEXT: BLEND %d = ir<0>/vp<%4> vp<%3>/ir<%cmp> -; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, ir<%i> -; CHECK-NEXT: WIDEN store ir<%idx>, ir<%d> -; CHECK-NEXT: No successors -; CHECK-NEXT: } +; CHECK: N0 [label = +; CHECK-NEXT: "for.body:\n" + +; CHECK-NEXT: "WIDEN-INDUCTION %i = phi 0, %i.next\l" + +; CHECK-NEXT: "WIDEN ir<%cmp> = icmp ir<%i>, ir<5>\l" +; CHECK-NEXT: ] +; +; CHECK: N2 [label = +; CHECK-NEXT: "pred.udiv.entry:\n" + +; CHECK-NEXT: + +; CHECK-NEXT: "BRANCH-ON-MASK ir<%cmp>\l"\l +; CHECK-NEXT: "CondBit: ir<%cmp>" +; CHECK-NEXT: ] +; +; CHECK: N4 [label = +; CHECK-NEXT: "pred.udiv.if:\n" + +; CHECK-NEXT: "REPLICATE ir<%tmp4> = udiv ir<%n>, ir<%i> (S->V)\l" +; CHECK-NEXT: ] +; +; CHECK: N5 [label = +; CHECK-NEXT: "pred.udiv.continue:\n" + +; 
CHECK-NEXT: "PHI-PREDICATED-INSTRUCTION vp<%3> = ir<%tmp4>\l" +; CHECK-NEXT: ] +; +; CHECK: N7 [label = +; CHECK-NEXT: "for.inc:\n" + +; CHECK-NEXT: "EMIT vp<%4> = not ir<%cmp>\l" + +; CHECK-NEXT: "BLEND %d = ir<0>/vp<%4> vp<%3>/ir<%cmp>\l" + +; CHECK-NEXT: "CLONE ir<%idx> = getelementptr ir<%x>, ir<%i>\l" + +; CHECK-NEXT: "WIDEN store ir<%idx>, ir<%d>\l" +; CHECK-NEXT: ] ; entry: br label %for.body diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index cf314043f011..880b8f711462 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -93,8 +93,7 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { // as this is not required with the new printing. Plan->addVPValue(&*F->arg_begin()); std::string FullDump; - raw_string_ostream OS(FullDump); - Plan->printDOT(OS); + raw_string_ostream(FullDump) << *Plan; const char *ExpectedStr = R"(digraph VPlan { graph [labelloc=t, fontsize=30; label="Vectorization Plan"] node [shape=rect, fontname=Courier, fontsize=30] @@ -104,28 +103,25 @@ compound=true fontname=Courier label="\ TopRegion" N1 [label = - "entry:\l" + - "Successor(s): for.body\l" + "entry:\n" ] N1 -> N2 [ label=""] N2 [label = - "for.body:\l" + - " WIDEN-PHI %indvars.iv = phi 0, %indvars.iv.next\l" + - " EMIT ir\<%arr.idx\> = getelementptr ir\<%A\> ir\<%indvars.iv\>\l" + - " EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" + - " EMIT ir\<%res\> = add ir\<%l1\> ir\<10\>\l" + - " EMIT store ir\<%res\> ir\<%arr.idx\>\l" + - " EMIT ir\<%indvars.iv.next\> = add ir\<%indvars.iv\> ir\<1\>\l" + - " EMIT ir\<%exitcond\> = icmp ir\<%indvars.iv.next\> ir\<%N\>\l" + - "Successor(s): for.body, for.end\l" + - "CondBit: ir\<%exitcond\> (for.body)\l" + "for.body:\n" + + "WIDEN-PHI %indvars.iv = phi 0, %indvars.iv.next\l" + + "EMIT ir<%arr.idx> = getelementptr ir<%A> ir<%indvars.iv>\l" + + "EMIT ir<%l1> = load ir<%arr.idx>\l" + + "EMIT ir<%res> = add ir<%l1> ir<10>\l" + + "EMIT store ir<%res> ir<%arr.idx>\l" + + "EMIT ir<%indvars.iv.next> = add ir<%indvars.iv> ir<1>\l" + + "EMIT ir<%exitcond> = icmp ir<%indvars.iv.next> ir<%N>\l" + + "CondBit: ir<%exitcond> (for.body)\l" ] N2 -> N2 [ label="T"] N2 -> N3 [ label="F"] N3 [label = - "for.end:\l" + - " EMIT ret\l" + - "No successors\l" + "for.end:\n" + + "EMIT ret\l" ] } } diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 71f27f95bad7..f8f1562d548c 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -333,14 +333,12 @@ TEST(VPBasicBlockTest, print) { VPBB1->appendRecipe(I1); VPBB1->appendRecipe(I2); VPBB1->appendRecipe(I3); - VPBB1->setName("bb1"); VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1}); VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4}); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBB2->appendRecipe(I4); VPBB2->appendRecipe(I5); - VPBB2->setName("bb2"); VPBlockUtils::connectBlocks(VPBB1, VPBB2); @@ -357,8 +355,7 @@ TEST(VPBasicBlockTest, print) { VPlan Plan; Plan.setEntry(VPBB1); std::string FullDump; - raw_string_ostream OS(FullDump); - Plan.printDOT(OS); + raw_string_ostream(FullDump) << Plan; const char *ExpectedStr = R"(digraph VPlan { graph [labelloc=t, fontsize=30; label="Vectorization Plan"] @@ -366,45 +363,21 @@ node [shape=rect, fontname=Courier, fontsize=30] edge [fontname=Courier, fontsize=30] compound=true N0 [label = - "bb1:\l" + - " EMIT 
vp\<%0\> = add\l" + - " EMIT vp\<%1\> = sub vp\<%0\>\l" + - " EMIT br vp\<%0\> vp\<%1\>\l" + - "Successor(s): bb2\l" + ":\n" + + "EMIT vp<%0> = add\l" + + "EMIT vp<%1> = sub vp<%0>\l" + + "EMIT br vp<%0> vp<%1>\l" ] N0 -> N1 [ label=""] N1 [label = - "bb2:\l" + - " EMIT vp\<%3\> = mul vp\<%1\> vp\<%0\>\l" + - " EMIT ret vp\<%3\>\l" + - "No successors\l" + ":\n" + + "EMIT vp<%3> = mul vp<%1> vp<%0>\l" + + "EMIT ret vp<%3>\l" ] } )"; EXPECT_EQ(ExpectedStr, FullDump); - const char *ExpectedBlock1Str = R"(bb1: - EMIT vp<%0> = add - EMIT vp<%1> = sub vp<%0> - EMIT br vp<%0> vp<%1> -Successor(s): bb2 -)"; - std::string Block1Dump; - raw_string_ostream OS1(Block1Dump); - VPBB1->print(OS1); - EXPECT_EQ(ExpectedBlock1Str, Block1Dump); - - // Ensure that numbering is good when dumping the second block in isolation. - const char *ExpectedBlock2Str = R"(bb2: - EMIT vp<%3> = mul vp<%1> vp<%0> - EMIT ret vp<%3> -No successors -)"; - std::string Block2Dump; - raw_string_ostream OS2(Block2Dump); - VPBB2->print(OS2); - EXPECT_EQ(ExpectedBlock2Str, Block2Dump); - { std::string I3Dump; raw_string_ostream OS(I3Dump); -- GitLab From 16947650d5ca602d63d5cd64e68bb0bb0f3674b7 Mon Sep 17 00:00:00 2001 From: thomasraoux Date: Tue, 16 Mar 2021 14:14:51 -0700 Subject: [PATCH 0075/1000] [mlir][linalg] Extend linalg vectorization to support non-identity input maps This propagates the affine map to transfer_read op in case it is not a minor identity map. Differential Revision: https://reviews.llvm.org/D98523 --- .../Linalg/Transforms/Vectorization.cpp | 104 +++++++++++------- mlir/lib/Dialect/Vector/VectorOps.cpp | 3 +- mlir/lib/IR/AffineMap.cpp | 5 +- mlir/test/Dialect/Linalg/vectorization.mlir | 36 ++++++ 4 files changed, 103 insertions(+), 45 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 880e7f385724..dab32d2e2727 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -87,11 +87,14 @@ static VectorType extractVectorTypeFromShapedValue(Value v) { /// Build a vector.transfer_read from `source` at indices set to all `0`. /// If source has rank zero, build an memref.load. /// Return the produced value. -static Value buildVectorRead(OpBuilder &builder, Value source) { +static Value buildVectorRead(OpBuilder &builder, Value source, + VectorType vectorType, AffineMap map) { edsc::ScopedContext scope(builder); auto shapedType = source.getType().cast(); - if (VectorType vectorType = extractVectorTypeFromShapedValue(source)) { + if (vectorType) { SmallVector indices(shapedType.getRank(), std_constant_index(0)); + if (map) + return vector_transfer_read(vectorType, source, indices, map); return vector_transfer_read(vectorType, source, indices); } return memref_load(source); @@ -238,6 +241,51 @@ vectorizeOneOp(OpBuilder &builder, Operation *op, builder.createOperation(state)}; } +/// Detect whether `r` has only ConstantOp, ElementwiseMappable and YieldOp. +static bool hasOnlyScalarElementwiseOp(Region &r) { + if (!llvm::hasSingleElement(r)) + return false; + for (Operation &op : r.front()) { + if (!(isa(op) || + OpTrait::hasElementwiseMappableTraits(&op)) || + llvm::any_of(op.getResultTypes(), + [](Type type) { return !type.isIntOrIndexOrFloat(); })) + return false; + } + return true; +} + +// Return true if the op is an element-wise linalg op. 
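// (Illustrative aside, not part of the patch: isElementwise still requires
// all-parallel iterators and identity *output* indexing maps, but a
// non-minor-identity *input* map, e.g. the transposed
// affine_map<(d0, d1, d2, d3) -> (d1, d0)> in the new test case, no longer
// blocks vectorization; the permutation is folded into the generated
// vector.transfer_read's permutation_map instead.)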
+static bool isElementwise(Operation *op) { + auto linalgOp = dyn_cast(op); + if (!linalgOp) + return false; + if (linalgOp.getNumLoops() != linalgOp.getNumParallelLoops()) + return false; + // TODO: relax the restrictions on indexing map. + for (unsigned i = 0, e = linalgOp.getNumOutputs(); i < e; i++) { + if (!linalgOp.getOutputIndexingMap(i).isIdentity()) + return false; + } + if (linalgOp->getNumRegions() != 1) + return false; + return hasOnlyScalarElementwiseOp(linalgOp->getRegion(0)); +} + +// Calculate the map to apply to transfer_read to convert the input shape into +// the output shape. +static AffineMap getTransferReadMap(LinalgOp linalgOp, unsigned argIndex) { + AffineMap linalgMap = linalgOp.getIndexingMap(argIndex); + MLIRContext *context = linalgMap.getContext(); + AffineExpr zero = mlir::getAffineConstantExpr(0, context); + SmallVector exprs(linalgMap.getNumInputs(), zero); + for (unsigned i : llvm::seq(unsigned(0), linalgMap.getNumResults())) { + exprs[linalgMap.getDimPosition(i)] = getAffineDimExpr(i, context); + } + return AffineMap::get(linalgMap.getNumResults(), /*symbolCount=*/0, exprs, + context); +} + /// Generic vectorization function that rewrites the body of a `linalgOp` into /// vector form. Generic vectorization proceeds as follows: /// 1. The region for the linalg op is created if necessary. @@ -282,7 +330,19 @@ LogicalResult vectorizeAsLinalgGeneric( SmallVector indexings; for (auto bbarg : block->getArguments()) { Value vectorArg = linalgOp.getShapedOperand(bbarg.getArgNumber()); - Value vectorRead = buildVectorRead(builder, vectorArg); + AffineMap map; + VectorType vectorType = extractVectorTypeFromShapedValue(vectorArg); + if (isElementwise(linalgOp) && + !linalgOp.getIndexingMap(bbarg.getArgNumber()).isMinorIdentity()) { + // Currently assume we don't support output permutations. + assert(linalgOp.getNumOutputs() > 0 && + linalgOp.getOutputIndexingMap(0).isIdentity()); + ArrayRef outputShape = + linalgOp.getOutputShapedType(0).getShape(); + vectorType = VectorType::get(outputShape, vectorType.getElementType()); + map = getTransferReadMap(linalgOp, bbarg.getArgNumber()); + } + Value vectorRead = buildVectorRead(builder, vectorArg, vectorType, map); LLVM_DEBUG(dbgs() << "\n[" DEBUG_TYPE "]: new vectorized bbarg(" << bbarg.getArgNumber() << "): " << vectorRead); bvm.map(bbarg, vectorRead); @@ -316,44 +376,6 @@ LogicalResult vectorizeAsLinalgGeneric( return success(); } -/// Detect whether `r` has only ConstantOp, ElementwiseMappable and YieldOp. -static bool hasOnlyScalarElementwiseOp(Region &r) { - if (!llvm::hasSingleElement(r)) - return false; - for (Operation &op : r.front()) { - if (!(isa(op) || - OpTrait::hasElementwiseMappableTraits(&op)) || - llvm::any_of(op.getResultTypes(), - [](Type type) { return !type.isIntOrIndexOrFloat(); })) - return false; - } - return true; -} - -// Return true if the op is an element-wise linalg op. -static bool isElementwise(Operation *op) { - auto linalgOp = dyn_cast(op); - if (!linalgOp) - return false; - if (linalgOp.getNumLoops() != linalgOp.getNumParallelLoops()) - return false; - // TODO: relax the restrictions on indexing map. - for (unsigned i = 0, e = linalgOp.getNumOutputs(); i < e; i++) { - if (!linalgOp.getOutputIndexingMap(i).isIdentity()) - return false; - } - // Currently bound the input indexing map to minor identity as other - // permutations might require adding transpose ops to convert the vector read - // to the right shape. 
- for (unsigned i = 0, e = linalgOp.getNumInputs(); i < e; i++) { - if (!linalgOp.getInputIndexingMap(i).isMinorIdentity()) - return false; - } - if (linalgOp->getNumRegions() != 1) - return false; - return hasOnlyScalarElementwiseOp(linalgOp->getRegion(0)); -} - static LogicalResult vectorizeContraction(OpBuilder &builder, LinalgOp linalgOp, SmallVectorImpl &newResults) { assert(isaContractionOpInterface(linalgOp) && diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp index 6ca28ba681ef..08bf7628e8c0 100644 --- a/mlir/lib/Dialect/Vector/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/VectorOps.cpp @@ -2294,8 +2294,7 @@ void TransferReadOp::build(OpBuilder &builder, OperationState &result, static void printTransferAttrs(OpAsmPrinter &p, VectorTransferOpInterface op) { SmallVector elidedAttrs; - if (op.permutation_map() == - getTransferMinorIdentityMap(op.getShapedType(), op.getVectorType())) + if (op.permutation_map().isMinorIdentity()) elidedAttrs.push_back(op.getPermutationMapAttrName()); bool elideMasked = true; if (auto maybeMasked = op.masked()) { diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index 9de80e96d451..98ca45bbb6f6 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -106,8 +106,9 @@ AffineMap AffineMap::getMinorIdentityMap(unsigned dims, unsigned results, } bool AffineMap::isMinorIdentity() const { - return *this == - getMinorIdentityMap(getNumDims(), getNumResults(), getContext()); + return getNumDims() >= getNumResults() && + *this == + getMinorIdentityMap(getNumDims(), getNumResults(), getContext()); } /// Returns true if this affine map is a minor identity up to broadcasted diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir index c43bf07d775d..74ff4367724e 100644 --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -341,6 +341,42 @@ func @generic_vectorize_tensor(%arg0: tensor<4x256xf32>, // ----- +// Test different input maps. 
+#matmul_trait = { + indexing_maps = [ + affine_map<(d0, d1, d2, d3) -> (d1, d0)>, + affine_map<(d0, d1, d2, d3) -> (d3, d1)>, + affine_map<(d0, d1, d2, d3) -> (d3, d1, d0, d2)>, + affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> + ], + iterator_types = ["parallel", "parallel", "parallel", "parallel"] +} + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d1, d0, 0, 0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (0, d1, 0, d0)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d2, d1, d3, d0)> +// CHECK: func @vectorization_transpose +// CHECK: vector.transfer_read {{.*}}{permutation_map = #[[MAP0]]} : memref<14x7xf32>, vector<7x14x8x16xf32> +// CHECK: vector.transfer_read {{.*}}{permutation_map = #[[MAP1]]} : memref<16x14xf32>, vector<7x14x8x16xf32> +// CHECK: vector.transfer_read {{.*}}{permutation_map = #[[MAP2]]} : memref<16x14x7x8xf32>, vector<7x14x8x16xf32> +// CHECK: addf {{.*}} : vector<7x14x8x16xf32> +// CHECK: addf {{.*}} : vector<7x14x8x16xf32> +// CHECK: vector.transfer_write {{.*}} : vector<7x14x8x16xf32>, memref<7x14x8x16xf32> +func @vectorization_transpose(%A: memref<14x7xf32>, %B: memref<16x14xf32>, + %C: memref<16x14x7x8xf32>, %D: memref<7x14x8x16xf32>) { + linalg.generic #matmul_trait + ins(%A, %B, %C : memref<14x7xf32>, memref<16x14xf32>, memref<16x14x7x8xf32>) + outs(%D : memref<7x14x8x16xf32>) { + ^bb(%a: f32, %b: f32, %c: f32, %d: f32) : + %e = addf %a, %b: f32 + %f = addf %e, %c: f32 + linalg.yield %f : f32 + } + return +} + +// ----- + // CHECK-LABEL: func @matmul_tensors // CHECK-SAME: (%[[ARG0:.*]]: tensor<8x4xf32>, %[[ARG1:.*]]: tensor<4x12xf32>, // CHECK-SAME: %[[ARG2:.*]]: tensor<8x12xf32>) -> tensor<8x12xf32> -- GitLab From 92068d6c31a45315402e4cefd3ec1c340090b41d Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 18 Mar 2021 15:19:00 -0400 Subject: [PATCH 0076/1000] [SimplifyCFG] add tests for branch cond merging with prof metadata; NFC See PR49336. 
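For context on the branch_weights metadata these tests revolve around: the sketch below is illustrative only and not part of the patch. It shows how such weights are attached and read back through LLVM's C++ API; the helper names are invented, and the 2000:1 ratio mirrors the !15/!16 annotations added by the test diff.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

// Attach weights equivalent to !{!"branch_weights", i32 2000, i32 1},
// i.e. the true edge is expected to be ~2000x more likely.
static BranchInst *emitWeightedBranch(IRBuilder<> &B, Value *Cond,
                                      BasicBlock *Likely, BasicBlock *Rare) {
  MDBuilder MDB(B.getContext());
  return B.CreateCondBr(Cond, Likely, Rare,
                        MDB.createBranchWeights(/*TrueWeight=*/2000,
                                                /*FalseWeight=*/1));
}

// A transform such as SimplifyCFG can read the weights back to decide
// whether folding two branches would hide a rarely evaluated, expensive
// condition.
static bool isHeavilyBiased(const BranchInst *BI) {
  uint64_t TrueW, FalseW;
  if (!BI->extractProfMetadata(TrueW, FalseW))
    return false; // missing or degenerate (e.g. all-zero) weights
  return TrueW > 1000 * FalseW || FalseW > 1000 * TrueW;
}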
--- .../SimplifyCFG/preserve-branchweights.ll | 417 +++++++++++++++++- 1 file changed, 408 insertions(+), 9 deletions(-) diff --git a/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll index 25106d435cc7..657accc9b6c2 100644 --- a/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll +++ b/llvm/test/Transforms/SimplifyCFG/preserve-branchweights.ll @@ -318,10 +318,10 @@ define void @test8(i64 %x, i64 %y) nounwind { ; CHECK-NEXT: [[LT:%.*]] = icmp slt i64 [[X:%.*]], [[Y:%.*]] ; CHECK-NEXT: br i1 [[LT]], label [[A:%.*]], label [[B:%.*]], !prof !7 ; CHECK: a: -; CHECK-NEXT: call void @helper(i32 0) [[ATTR1:#.*]] +; CHECK-NEXT: call void @helper(i32 0) #[[ATTR1:[0-9]+]] ; CHECK-NEXT: ret void ; CHECK: b: -; CHECK-NEXT: call void @helper(i32 1) [[ATTR1]] +; CHECK-NEXT: call void @helper(i32 1) #[[ATTR1]] ; CHECK-NEXT: ret void ; entry: @@ -355,14 +355,14 @@ define i1 @test9(i32 %x, i32 %y) nounwind { ; CHECK-NEXT: i32 92, label [[END]] ; CHECK-NEXT: ], !prof !8 ; CHECK: a: -; CHECK-NEXT: call void @helper(i32 0) [[ATTR1]] +; CHECK-NEXT: call void @helper(i32 0) #[[ATTR1]] ; CHECK-NEXT: [[RETA:%.*]] = icmp slt i32 [[X]], [[Y:%.*]] ; CHECK-NEXT: ret i1 [[RETA]] ; CHECK: bees: ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[RET:%.*]] = phi i1 [ true, [[ENTRY:%.*]] ], [ false, [[BEES]] ], [ true, [[ENTRY]] ], [ true, [[ENTRY]] ] -; CHECK-NEXT: call void @helper(i32 2) [[ATTR1]] +; CHECK-NEXT: call void @helper(i32 2) #[[ATTR1]] ; CHECK-NEXT: ret i1 [[RET]] ; entry: @@ -394,10 +394,10 @@ define void @test10(i32 %x) nounwind readnone ssp noredzone { ; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i32 [[X_OFF]], 3 ; CHECK-NEXT: br i1 [[SWITCH]], label [[LOR_END:%.*]], label [[LOR_RHS:%.*]], !prof !9 ; CHECK: lor.rhs: -; CHECK-NEXT: call void @helper(i32 1) [[ATTR1]] +; CHECK-NEXT: call void @helper(i32 1) #[[ATTR1]] ; CHECK-NEXT: ret void ; CHECK: lor.end: -; CHECK-NEXT: call void @helper(i32 0) [[ATTR1]] +; CHECK-NEXT: call void @helper(i32 0) #[[ATTR1]] ; CHECK-NEXT: ret void ; entry: @@ -424,10 +424,10 @@ define void @test11(i32 %x) nounwind { ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[I]], 24 ; CHECK-NEXT: br i1 [[COND]], label [[C:%.*]], label [[A:%.*]], !prof !10 ; CHECK: a: -; CHECK-NEXT: call void @helper(i32 0) [[ATTR1]] +; CHECK-NEXT: call void @helper(i32 0) #[[ATTR1]] ; CHECK-NEXT: ret void ; CHECK: c: -; CHECK-NEXT: call void @helper(i32 2) [[ATTR1]] +; CHECK-NEXT: call void @helper(i32 2) #[[ATTR1]] ; CHECK-NEXT: ret void ; %i = shl i32 %x, 1 @@ -472,7 +472,7 @@ sw.epilog: define void @test13(i32 %x) nounwind { ; CHECK-LABEL: @test13( ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @helper(i32 0) [[ATTR1]] +; CHECK-NEXT: call void @helper(i32 0) #[[ATTR1]] ; CHECK-NEXT: ret void ; entry: @@ -636,6 +636,400 @@ exit: ret i32 %outval } +; FIXME: Merging the icmps with logic-op defeats the purpose of the metadata. +; We can't tell which condition is expensive if they are combined. 
+ +define void @or_icmps_harmful(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @or_icmps_harmful( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[EXPECTED_TRUE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof !19 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_true = icmp sgt i32 %x, -1 + br i1 %expected_true, label %exit, label %rare, !prof !15 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %exit, label %false + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +; FIXME: Merging the icmps with logic-op defeats the purpose of the metadata. +; We can't tell which condition is expensive if they are combined. + +define void @or_icmps_harmful_inverted(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @or_icmps_harmful_inverted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_FALSE:%.*]] = icmp sle i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[EXPECTED_FALSE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof !19 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_false = icmp sgt i32 %x, -1 + br i1 %expected_false, label %rare, label %exit, !prof !16 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %exit, label %false + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +; The probability threshold is set by a builtin_expect setting. 
+ +define void @or_icmps_not_that_harmful(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @or_icmps_not_that_harmful( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[EXPECTED_TRUE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof !20 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_true = icmp sgt i32 %x, -1 + br i1 %expected_true, label %exit, label %rare, !prof !17 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %exit, label %false + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +define void @or_icmps_not_that_harmful_inverted(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @or_icmps_not_that_harmful_inverted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[EXPECTED_TRUE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof !21 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_true = icmp sgt i32 %x, -1 + br i1 %expected_true, label %exit, label %rare, !prof !18 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %exit, label %false + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +define void @or_icmps_useful(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @or_icmps_useful( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sle i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[EXPECTED_TRUE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof !22 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_true = icmp sgt i32 %x, -1 + br i1 %expected_true, label %likely, label %exit, !prof !15 + +likely: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %exit, label %false + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +define void @or_icmps_useful_inverted(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @or_icmps_useful_inverted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_FALSE:%.*]] = icmp sgt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[EXPECTED_FALSE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[FALSE:%.*]], !prof !22 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_false = icmp sgt i32 %x, -1 + br i1 %expected_false, label %exit, label %likely, !prof !16 + +likely: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %exit, label %false + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +; Don't crash processing degenerate metadata. 
+ +define void @or_icmps_empty_metadata(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @or_icmps_empty_metadata( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[EXPECTED_TRUE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[MORE_RARE:%.*]] +; CHECK: more_rare: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_true = icmp sgt i32 %x, -1 + br i1 %expected_true, label %exit, label %rare, !prof !19 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %exit, label %more_rare + +more_rare: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +; FIXME: Merging the icmps with logic-op defeats the purpose of the metadata. +; We can't tell which condition is expensive if they are combined. + +define void @and_icmps_harmful(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @and_icmps_harmful( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_FALSE:%.*]] = icmp sgt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[EXPECTED_FALSE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof !23 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_false = icmp sgt i32 %x, -1 + br i1 %expected_false, label %rare, label %exit, !prof !16 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %false, label %exit + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +; FIXME: Merging the icmps with logic-op defeats the purpose of the metadata. +; We can't tell which condition is expensive if they are combined. 
+ +define void @and_icmps_harmful_inverted(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @and_icmps_harmful_inverted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sle i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[EXPECTED_TRUE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof !23 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_true = icmp sgt i32 %x, -1 + br i1 %expected_true, label %exit, label %rare, !prof !15 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %false, label %exit + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +define void @and_icmps_not_that_harmful(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @and_icmps_not_that_harmful( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_FALSE:%.*]] = icmp sgt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[EXPECTED_FALSE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof !24 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_false = icmp sgt i32 %x, -1 + br i1 %expected_false, label %rare, label %exit, !prof !18 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %false, label %exit + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +define void @and_icmps_not_that_harmful_inverted(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @and_icmps_not_that_harmful_inverted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sle i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[EXPECTED_TRUE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof !24 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_true = icmp sgt i32 %x, -1 + br i1 %expected_true, label %exit, label %rare, !prof !17 + +rare: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %false, label %exit + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +define void @and_icmps_useful(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @and_icmps_useful( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_TRUE:%.*]] = icmp sgt i32 [[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[EXPECTED_TRUE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof !25 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_true = icmp sgt i32 %x, -1 + br i1 %expected_true, label %likely, label %exit, !prof !15 + +likely: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %false, label %exit + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + +define void @and_icmps_useful_inverted(i32 %x, i32 %y, i8* %p) { +; CHECK-LABEL: @and_icmps_useful_inverted( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[EXPECTED_FALSE:%.*]] = icmp sle i32 
[[X:%.*]], -1 +; CHECK-NEXT: [[EXPENSIVE:%.*]] = icmp eq i32 [[Y:%.*]], 0 +; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[EXPECTED_FALSE]], [[EXPENSIVE]] +; CHECK-NEXT: br i1 [[OR_COND]], label [[FALSE:%.*]], label [[EXIT:%.*]], !prof !25 +; CHECK: false: +; CHECK-NEXT: store i8 42, i8* [[P:%.*]], align 1 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %expected_false = icmp sgt i32 %x, -1 + br i1 %expected_false, label %exit, label %likely, !prof !16 + +likely: + %expensive = icmp eq i32 %y, 0 + br i1 %expensive, label %false, label %exit + +false: + store i8 42, i8* %p, align 1 + br label %exit + +exit: + ret void +} + + !0 = !{!"branch_weights", i32 3, i32 5} !1 = !{!"branch_weights", i32 1, i32 1} !2 = !{!"branch_weights", i32 1, i32 2} @@ -651,6 +1045,11 @@ exit: !12 = !{!"these_are_not_the_branch_weights_you_are_looking_for", i32 3, i32 5} !13 = !{!"branch_weights", i32 2, i32 3} !14 = !{!"branch_weights", i32 4, i32 7} +!15 = !{!"branch_weights", i32 2000, i32 1} +!16 = !{!"branch_weights", i32 1, i32 2000} +!17 = !{!"branch_weights", i32 1999, i32 1} +!18 = !{!"branch_weights", i32 1, i32 1999} +!19 = !{!"branch_weights", i32 0, i32 0} ; CHECK: !0 = !{!"branch_weights", i32 5, i32 11} ; CHECK: !1 = !{!"branch_weights", i32 1, i32 3} -- GitLab From 0c208d1f42be3fcbca37729cafcab5e97ce0a8e2 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 18 Mar 2021 19:40:48 +0100 Subject: [PATCH 0077/1000] [lldb] Fix flakyness in TestGdbRemote_vContThreads The cause is the non-async-signal-safety printf function (et al.). If the test managed to interrupt the process and inject a signal before the printf("@started") call returned (but after it has actually written the output), that string could end up being printed twice (presumably, because the function did not manage the clear the userspace buffer, and so the print call in the signal handler would print it once again). This patch fixes the issue by replacing the printf call in the signal handler with a sprintf+write combo, which should not suffer from that problem (though I wouldn't go as far as to call it async signal safe). --- lldb/test/API/tools/lldb-server/main.cpp | 43 +++++++++--------------- 1 file changed, 16 insertions(+), 27 deletions(-) diff --git a/lldb/test/API/tools/lldb-server/main.cpp b/lldb/test/API/tools/lldb-server/main.cpp index 4a877deaafbd..8a14c11075f1 100644 --- a/lldb/test/API/tools/lldb-server/main.cpp +++ b/lldb/test/API/tools/lldb-server/main.cpp @@ -70,25 +70,23 @@ static void print_pid() { #endif } -static void print_thread_id() { +static uint64_t get_thread_id() { // Put in the right magic here for your platform to spit out the thread id (tid) -// that debugserver/lldb-gdbserver would see as a TID. Otherwise, let the else -// clause print out the unsupported text so that the unit test knows to skip -// verifying thread ids. +// that debugserver/lldb-gdbserver would see as a TID. #if defined(__APPLE__) __uint64_t tid = 0; pthread_threadid_np(pthread_self(), &tid); - printf("%" PRIx64, tid); + return tid; #elif defined(__linux__) // This is a call to gettid() via syscall. 
- printf("%" PRIx64, static_cast(syscall(__NR_gettid))); + return syscall(__NR_gettid); #elif defined(__NetBSD__) // Technically lwpid_t is 32-bit signed integer - printf("%" PRIx64, static_cast(_lwp_self())); + return static_cast(_lwp_self()); #elif defined(_WIN32) - printf("%" PRIx64, static_cast(::GetCurrentThreadId())); + return static_cast(::GetCurrentThreadId()); #else - printf("{no-tid-support}"); + return -1; #endif } @@ -109,15 +107,12 @@ static void signal_handler(int signo) { } // Print notice that we received the signal on a given thread. - { - std::lock_guard lock(g_print_mutex); - if (signal_name) - printf("received %s on thread id: ", signal_name); - else - printf("received signo %d (%s) on thread id: ", signo, strsignal(signo)); - print_thread_id(); - printf("\n"); - } + char buf[100]; + if (signal_name) + snprintf(buf, sizeof(buf), "received %s on thread id: %" PRIx64 "\n", signal_name, get_thread_id()); + else + snprintf(buf, sizeof(buf), "received signo %d (%s) on thread id: %" PRIx64 "\n", signo, strsignal(signo), get_thread_id()); + write(STDOUT_FILENO, buf, strlen(buf)); // Reset the signal handler if we're one of the expected signal handlers. switch (signo) { @@ -195,9 +190,7 @@ static void *thread_func(void *arg) { const int this_thread_index = s_thread_index++; if (g_print_thread_ids) { std::lock_guard lock(g_print_mutex); - printf("thread %d id: ", this_thread_index); - print_thread_id(); - printf("\n"); + printf("thread %d id: %" PRIx64 "\n", this_thread_index, get_thread_id()); } if (g_threads_do_segfault) { @@ -229,9 +222,7 @@ static void *thread_func(void *arg) { { std::lock_guard lock(g_print_mutex); - printf("thread "); - print_thread_id(); - printf(": past SIGSEGV\n"); + printf("thread %" PRIx64 ": past SIGSEGV\n", get_thread_id()); } } @@ -362,9 +353,7 @@ int main(int argc, char **argv) { // And announce us. 
{ std::lock_guard lock(g_print_mutex); - printf("thread 0 id: "); - print_thread_id(); - printf("\n"); + printf("thread 0 id: %" PRIx64 "\n", get_thread_id()); } } else if (std::strstr(argv[i] + strlen(THREAD_PREFIX), THREAD_COMMAND_SEGFAULT)) { -- GitLab From 1a572f4509a6fb392e87b7ea0346344bf6b8ac66 Mon Sep 17 00:00:00 2001 From: thomasraoux Date: Thu, 18 Mar 2021 12:59:49 -0700 Subject: [PATCH 0078/1000] [mlir] Add vector op support to cuda-runner including vector.print Differential Revision: https://reviews.llvm.org/D97346 --- .../Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp | 2 ++ mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp index 44dfd730e44f..0e3bf166c47e 100644 --- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp +++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp @@ -18,6 +18,7 @@ #include "../PassDetail.h" #include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/Async/IR/Async.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/GPU/Passes.h" @@ -313,6 +314,7 @@ void GpuToLLVMConversionPass::runOnOperation() { OwningRewritePatternList patterns; LLVMConversionTarget target(getContext()); + populateVectorToLLVMConversionPatterns(converter, patterns); populateStdToLLVMConversionPatterns(converter, patterns); populateAsyncStructuralTypeConversionsAndLegality(&getContext(), converter, patterns, target); diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir index ec9720f55666..c4ad89778d97 100644 --- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir +++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir @@ -21,7 +21,10 @@ func @other_func(%arg0 : f32, %arg1 : memref) { } // CHECK: [1, 1, 1, 1, 1] +// CHECK: ( 1, 1 ) func @main() { + %v0 = constant 0.0 : f32 + %c0 = constant 0: index %arg0 = memref.alloc() : memref<5xf32> %21 = constant 5 : i32 %22 = memref.cast %arg0 : memref<5xf32> to memref @@ -31,6 +34,8 @@ func @main() { %24 = constant 1.0 : f32 call @other_func(%24, %22) : (f32, memref) -> () call @print_memref_f32(%23) : (memref<*xf32>) -> () + %val1 = vector.transfer_read %arg0[%c0], %v0: memref<5xf32>, vector<2xf32> + vector.print %val1: vector<2xf32> return } -- GitLab From 1c740b29fae3962a9c8644496352b10798d925ef Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Thu, 18 Mar 2021 12:27:42 -0700 Subject: [PATCH 0079/1000] [clang-cl] make -ffile-compilation-dir a CoreOption. Let clang-cl accepts `-ffile-compilation-dir` flag. 
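An illustrative invocation assuming this change (only the flag itself comes from the patch; the file name and the /Z7 debug-info switch are examples):

  clang-cl /c /Z7 -ffile-compilation-dir=. main.c
  clang -c -g -ffile-compilation-dir=. main.c

In either driver mode the flag embeds "." as the recorded compilation directory in the debug info and coverage mapping, which keeps the output independent of the absolute build path.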
Differential Revision: https://reviews.llvm.org/D98887 --- clang/include/clang/Driver/Options.td | 1 + clang/test/Driver/cl-options.c | 1 + 2 files changed, 2 insertions(+) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 9c5013ee88d9..a9b43a8fe620 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1125,6 +1125,7 @@ def fcoverage_compilation_dir_EQ : Joined<["-"], "fcoverage-compilation-dir=">, HelpText<"The compilation directory to embed in the coverage mapping.">, MarshallingInfoString>; def ffile_compilation_dir_EQ : Joined<["-"], "ffile-compilation-dir=">, Group, + Flags<[CoreOption]>, HelpText<"The compilation directory to embed in the debug info and coverage mapping.">; defm debug_info_for_profiling : BoolFOption<"debug-info-for-profiling", CodeGenOpts<"DebugInfoForProfiling">, DefaultFalse, diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 7d83b3d60b1e..90f865d9c7c0 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -643,6 +643,7 @@ // RUN: -fno-diagnostics-color \ // RUN: -fdebug-compilation-dir . \ // RUN: -fdebug-compilation-dir=. \ +// RUN: -ffile-compilation-dir=. \ // RUN: -fdiagnostics-parseable-fixits \ // RUN: -fdiagnostics-absolute-paths \ // RUN: -ferror-limit=10 \ -- GitLab From 32a744ab20f37681f71ca9098625994515f0f4ab Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 15 Mar 2021 16:52:31 -0700 Subject: [PATCH 0080/1000] [mlir] Add linalg.fill bufferization conversion `BufferizeAnyLinalgOp` fails because `FillOp` is not a `LinalgGenericOp` and it fails while reading operand sizes attribute. Reviewed By: nicolasvasilache Differential Revision: https://reviews.llvm.org/D98671 --- .../Dialect/Linalg/Transforms/Bufferize.cpp | 31 +++++++++++++++---- mlir/test/Dialect/Linalg/bufferize.mlir | 13 ++++++++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp index 32b2ee706d19..419226b35179 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp @@ -32,8 +32,7 @@ static Value cloneMemref(Location loc, Value memref, OpBuilder &b) { } static LogicalResult -allocateBuffersForResults(Location loc, LinalgOp linalgOp, - linalg::GenericOpAdaptor &adaptor, +allocateBuffersForResults(Location loc, LinalgOp linalgOp, ValueRange outputs, SmallVectorImpl &resultBuffers, OpBuilder &b) { // Lazily compute loopRanges. SmallVector loopRanges; @@ -52,7 +51,7 @@ allocateBuffersForResults(Location loc, LinalgOp linalgOp, } auto tensorShape = tensorType.getShape(); auto memrefType = MemRefType::get(tensorShape, tensorType.getElementType()); - Value resultTensor = adaptor.outputs()[resultIndex]; + Value resultTensor = outputs[resultIndex]; // Clone output buffers whose value is actually used. if (linalgOp.payloadUsesValueFromOutputOperandIndex(resultIndex)) { @@ -138,8 +137,7 @@ static void finalizeBufferAllocation(ConversionPatternRewriter &rewriter, namespace { -/// Generic conversion pattern that matches any LinalgOp. This avoids template -/// instantiating one pattern for each LinalgOp. +/// Conversion pattern that replaces `linalg.init_tensor` with allocation. 
class BufferizeInitTensorOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -155,6 +153,26 @@ public: } }; +/// Conversion pattern that bufferizes `linalg.fill` operation. +class BufferizeFillOp : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(FillOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + linalg::FillOpAdaptor adaptor(operands, op->getAttrDictionary()); + if (!op.output().getType().isa()) + return rewriter.notifyMatchFailure(op, + "operand must be of a tensor type"); + + rewriter.create(op.getLoc(), adaptor.output(), adaptor.value()); + rewriter.replaceOp(op, adaptor.output()); + + return success(); + } +}; + /// Generic conversion pattern that matches any LinalgOp. This avoids template /// instantiating one pattern for each LinalgOp. class BufferizeAnyLinalgOp : public ConversionPattern { @@ -178,7 +196,7 @@ public: Location loc = linalgOp.getLoc(); SmallVector newOutputBuffers; - if (failed(allocateBuffersForResults(loc, linalgOp, adaptor, + if (failed(allocateBuffersForResults(loc, linalgOp, adaptor.outputs(), newOutputBuffers, rewriter))) { linalgOp.emitOpError() << "Failed to allocate buffers for tensor results."; @@ -325,6 +343,7 @@ void mlir::linalg::populateLinalgBufferizePatterns( // TODO: Drop this once tensor constants work in standard. // clang-format off patterns.insert< + BufferizeFillOp, BufferizeInitTensorOp, SubTensorOpConverter, SubTensorInsertOpConverter diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir index e0027b765d25..1c7cec1de07a 100644 --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -265,3 +265,16 @@ func @bufferize_subtensor_insert(%t : tensor, %st0 : tensor<2x3xf32>, % return %t0, %t1: tensor, tensor } +// ----- + +// CHECK-LABEL: func @bufferize_fill( +// CHECK-SAME: %[[IN:.*]]: tensor +func @bufferize_fill(%arg0: tensor) -> tensor { + %c0 = constant 0.0 : f32 + // CHECK: %[[MEMREF:.*]] = tensor_to_memref %[[IN]] : memref + // CHECK: linalg.fill(%[[MEMREF]], %cst) : memref, f32 + // CHECK: %[[TENSOR:.*]] = tensor_load %[[MEMREF]] : memref + // CHECK: return %[[TENSOR]] + %0 = linalg.fill(%arg0, %c0) : tensor, f32 -> tensor + return %0 : tensor +} -- GitLab From 36335fe753690c20c73a48a168d4b11feb3810a8 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 18 Mar 2021 13:54:56 -0700 Subject: [PATCH 0081/1000] [lldb] Move Apple simulators test targets under API Move the Apple simulators test targets as they only matter for the API tests. Differential revision: https://reviews.llvm.org/D98880 --- lldb/test/API/CMakeLists.txt | 22 ++++++++++++++++++++++ lldb/test/CMakeLists.txt | 23 ----------------------- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/lldb/test/API/CMakeLists.txt b/lldb/test/API/CMakeLists.txt index 001712fcfbce..0dbc46defc81 100644 --- a/lldb/test/API/CMakeLists.txt +++ b/lldb/test/API/CMakeLists.txt @@ -167,3 +167,25 @@ if (CMAKE_GENERATOR STREQUAL "Xcode") ${CMAKE_CURRENT_BINARY_DIR} DEPENDS lldb-test-deps) endif() + +# Targets for running the test suite on the different Apple simulators. 
+add_lit_testsuite(check-lldb-simulator-ios + "Running lldb test suite on the iOS simulator" + ${CMAKE_CURRENT_BINARY_DIR} + PARAMS "lldb-run-with-simulator=ios" + EXCLUDE_FROM_CHECK_ALL + DEPENDS lldb-test-deps) + +add_lit_testsuite(check-lldb-simulator-watchos + "Running lldb test suite on the watchOS simulator" + ${CMAKE_CURRENT_BINARY_DIR} + PARAMS "lldb-run-with-simulator=watchos" + EXCLUDE_FROM_CHECK_ALL + DEPENDS lldb-test-deps) + +add_lit_testsuite(check-lldb-simulator-tvos + "Running lldb test suite on the tvOS simulator" + ${CMAKE_CURRENT_BINARY_DIR} + PARAMS "lldb-run-with-simulator=tvos" + EXCLUDE_FROM_CHECK_ALL + DEPENDS lldb-test-deps) diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 9944e37a46fc..8363bde23035 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -220,29 +220,6 @@ add_lit_testsuite(check-lldb-reproducers DEPENDS lldb-test-deps) add_dependencies(check-lldb-reproducers check-lldb-reproducers-capture) -# Targets for running the test suite on the different Apple simulators. -add_lit_testsuite(check-lldb-simulator-ios - "Running lldb test suite on the iOS simulator" - ${CMAKE_CURRENT_BINARY_DIR}/API - PARAMS "lldb-run-with-simulator=ios" - EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-test-deps) - -add_lit_testsuite(check-lldb-simulator-watchos - "Running lldb test suite on the watchOS simulator" - ${CMAKE_CURRENT_BINARY_DIR}/API - PARAMS "lldb-run-with-simulator=watchos" - EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-test-deps) - -add_lit_testsuite(check-lldb-simulator-tvos - "Running lldb test suite on the tvOS simulator" - ${CMAKE_CURRENT_BINARY_DIR}/API - PARAMS "lldb-run-with-simulator=tvos" - EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-test-deps) - - if(LLDB_BUILT_STANDALONE) # This has to happen *AFTER* add_lit_testsuite. if (EXISTS ${LLVM_MAIN_SRC_DIR}/utils/llvm-lit) -- GitLab From c1940aac99ea4ea8420dff00a55065a94f1a1195 Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Thu, 18 Mar 2021 21:55:39 +0100 Subject: [PATCH 0082/1000] Revert "[AArch64][compiler-rt] Strip PAC from the link register." This reverts commit ad40453fc425ee8e1fe43c7bb6e3c1c3afa9cc3b. --- .../lib/sanitizer_common/sanitizer_ptrauth.h | 20 +---------------- .../sanitizer_common/sanitizer_stacktrace.cpp | 3 +-- compiler-rt/lib/tsan/rtl/tsan_external.cpp | 4 ++-- compiler-rt/lib/tsan/rtl/tsan_interface.cpp | 8 +++---- compiler-rt/lib/tsan/rtl/tsan_interface_inl.h | 22 +++++++++---------- 5 files changed, 19 insertions(+), 38 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h index b6a8bef06ee4..a288068bf943 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h @@ -11,24 +11,6 @@ #if __has_feature(ptrauth_calls) #include -#elif defined(__ARM_FEATURE_PAC_DEFAULT) && !defined(__APPLE__) -inline unsigned long ptrauth_strip(unsigned long __value, unsigned int __key) { - // On the stack the link register is protected with Pointer - // Authentication Code when compiled with -mbranch-protection. - // Let's stripping the PAC unconditionally because xpaclri is in - // the NOP space so will do nothing when it is not enabled or not available. 
- unsigned long ret; - asm volatile( - "mov x30, %1\n\t" - "hint #7\n\t" // xpaclri - "mov %0, x30\n\t" - : "=r"(ret) - : "r"(__value) - : "x30"); - return ret; -} -#define ptrauth_auth_data(__value, __old_key, __old_data) __value -#define ptrauth_string_discriminator(__string) ((int)0) #else // Copied from #define ptrauth_strip(__value, __key) __value @@ -36,6 +18,6 @@ inline unsigned long ptrauth_strip(unsigned long __value, unsigned int __key) { #define ptrauth_string_discriminator(__string) ((int)0) #endif -#define STRIP_PAC_PC(pc) ((uptr)ptrauth_strip((uptr)pc, 0)) +#define STRIP_PC(pc) ((uptr)ptrauth_strip(pc, 0)) #endif // SANITIZER_PTRAUTH_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp index ea0d49ac2e8f..b0487d8987db 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp @@ -15,7 +15,6 @@ #include "sanitizer_common.h" #include "sanitizer_flags.h" #include "sanitizer_platform.h" -#include "sanitizer_ptrauth.h" namespace __sanitizer { @@ -123,7 +122,7 @@ void BufferedStackTrace::UnwindFast(uptr pc, uptr bp, uptr stack_top, // frame[-1] contains the return address uhwptr pc1 = frame[-1]; #else - uhwptr pc1 = STRIP_PAC_PC(frame[1]); + uhwptr pc1 = frame[1]; #endif // Let's assume that any pointer in the 0th page (i.e. <0x1000 on i386 and // x86_64) is invalid and stop unwinding here. If we're adding support for diff --git a/compiler-rt/lib/tsan/rtl/tsan_external.cpp b/compiler-rt/lib/tsan/rtl/tsan_external.cpp index a87e12f2936f..466b2bf0f66c 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_external.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_external.cpp @@ -111,12 +111,12 @@ void __tsan_external_assign_tag(void *addr, void *tag) { SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_read(void *addr, void *caller_pc, void *tag) { - ExternalAccess(addr, STRIP_PAC_PC(caller_pc), tag, MemoryRead); + ExternalAccess(addr, STRIP_PC(caller_pc), tag, MemoryRead); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_write(void *addr, void *caller_pc, void *tag) { - ExternalAccess(addr, STRIP_PAC_PC(caller_pc), tag, MemoryWrite); + ExternalAccess(addr, STRIP_PC(caller_pc), tag, MemoryWrite); } } // extern "C" diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface.cpp index 9bd0e8580b17..55f1c9834f70 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interface.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interface.cpp @@ -40,13 +40,13 @@ void __tsan_write16(void *addr) { } void __tsan_read16_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr + 8, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr + 8, kSizeLog8); } void __tsan_write16_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr + 8, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr + 8, kSizeLog8); } // __tsan_unaligned_read/write calls are emitted by compiler. 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h b/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h index 5e77d4d3d288..f5d743c10772 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h +++ b/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h @@ -51,35 +51,35 @@ void __tsan_write8(void *addr) { } void __tsan_read1_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog1); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog1); } void __tsan_read2_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog2); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog2); } void __tsan_read4_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog4); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog4); } void __tsan_read8_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); } void __tsan_write1_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog1); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog1); } void __tsan_write2_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog2); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog2); } void __tsan_write4_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog4); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog4); } void __tsan_write8_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); } void __tsan_vptr_update(void **vptr_p, void *new_val) { @@ -101,7 +101,7 @@ void __tsan_vptr_read(void **vptr_p) { } void __tsan_func_entry(void *pc) { - FuncEntry(cur_thread(), STRIP_PAC_PC(pc)); + FuncEntry(cur_thread(), STRIP_PC(pc)); } void __tsan_func_exit() { @@ -125,9 +125,9 @@ void __tsan_write_range(void *addr, uptr size) { } void __tsan_read_range_pc(void *addr, uptr size, void *pc) { - MemoryAccessRange(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, size, false); + MemoryAccessRange(cur_thread(), STRIP_PC(pc), (uptr)addr, size, false); } void __tsan_write_range_pc(void *addr, uptr size, void *pc) { - MemoryAccessRange(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, size, true); + MemoryAccessRange(cur_thread(), STRIP_PC(pc), (uptr)addr, size, true); } -- GitLab From 4220531ceff0742851b8a2a5836400a7a521491d Mon Sep 17 00:00:00 2001 From: Daniel Kiss Date: Mon, 15 Mar 2021 10:24:44 +0100 Subject: [PATCH 0083/1000] [AArch64][compiler-rt] Strip PAC from the link register. -mbranch-protection protects the LR on the stack with PAC. When the frames are walked the LR need to be cleared. This inline assembly later will be replaced with a new builtin. Test: build with -DCMAKE_C_FLAGS="-mbranch-protection=standard". 
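As a standalone illustration of the mechanism (a sketch assuming an AArch64 build with -mbranch-protection=pac-ret; the names are invented and this code is not part of the patch): the LR saved in a frame record carries the authentication code in its upper bits, so it has to be stripped before the unwinder can treat it as a plain code address.

#include <cstdint>

// xpaclri is encoded in the hint space ("hint #7"), so on cores without
// pointer authentication it executes as a NOP and the value is unchanged.
static inline uintptr_t strip_pac(uintptr_t value) {
#if defined(__aarch64__)
  register uintptr_t lr __asm__("x30") = value; // xpaclri operates on x30
  __asm__("hint #7" : "+r"(lr));                // xpaclri
  return lr;
#else
  return value;
#endif
}

// An AArch64 frame record is {previous FP, saved LR}; with pac-ret the
// saved LR may be signed, so strip it before symbolizing or comparing.
static uintptr_t return_address_of(const uintptr_t *frame_record) {
  return strip_pac(frame_record[1]);
}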
Reviewed By: kubamracek Differential Revision: https://reviews.llvm.org/D98008 --- .../lib/sanitizer_common/sanitizer_ptrauth.h | 20 ++++++++++++++++- .../sanitizer_common/sanitizer_stacktrace.cpp | 3 ++- compiler-rt/lib/tsan/rtl/tsan_external.cpp | 4 ++-- compiler-rt/lib/tsan/rtl/tsan_interface.cpp | 8 +++---- compiler-rt/lib/tsan/rtl/tsan_interface_inl.h | 22 +++++++++---------- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h index a288068bf943..520035469485 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_ptrauth.h @@ -11,6 +11,24 @@ #if __has_feature(ptrauth_calls) #include +#elif defined(__ARM_FEATURE_PAC_DEFAULT) && !defined(__APPLE__) +inline unsigned long ptrauth_strip(void* __value, unsigned int __key) { + // On the stack the link register is protected with Pointer + // Authentication Code when compiled with -mbranch-protection. + // Let's stripping the PAC unconditionally because xpaclri is in + // the NOP space so will do nothing when it is not enabled or not available. + unsigned long ret; + asm volatile( + "mov x30, %1\n\t" + "hint #7\n\t" // xpaclri + "mov %0, x30\n\t" + : "=r"(ret) + : "r"(__value) + : "x30"); + return ret; +} +#define ptrauth_auth_data(__value, __old_key, __old_data) __value +#define ptrauth_string_discriminator(__string) ((int)0) #else // Copied from #define ptrauth_strip(__value, __key) __value @@ -18,6 +36,6 @@ #define ptrauth_string_discriminator(__string) ((int)0) #endif -#define STRIP_PC(pc) ((uptr)ptrauth_strip(pc, 0)) +#define STRIP_PAC_PC(pc) ((uptr)ptrauth_strip(pc, 0)) #endif // SANITIZER_PTRAUTH_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp index b0487d8987db..07e4409f4a5d 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_stacktrace.cpp @@ -15,6 +15,7 @@ #include "sanitizer_common.h" #include "sanitizer_flags.h" #include "sanitizer_platform.h" +#include "sanitizer_ptrauth.h" namespace __sanitizer { @@ -122,7 +123,7 @@ void BufferedStackTrace::UnwindFast(uptr pc, uptr bp, uptr stack_top, // frame[-1] contains the return address uhwptr pc1 = frame[-1]; #else - uhwptr pc1 = frame[1]; + uhwptr pc1 = STRIP_PAC_PC((void *)frame[1]); #endif // Let's assume that any pointer in the 0th page (i.e. <0x1000 on i386 and // x86_64) is invalid and stop unwinding here. 
If we're adding support for diff --git a/compiler-rt/lib/tsan/rtl/tsan_external.cpp b/compiler-rt/lib/tsan/rtl/tsan_external.cpp index 466b2bf0f66c..a87e12f2936f 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_external.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_external.cpp @@ -111,12 +111,12 @@ void __tsan_external_assign_tag(void *addr, void *tag) { SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_read(void *addr, void *caller_pc, void *tag) { - ExternalAccess(addr, STRIP_PC(caller_pc), tag, MemoryRead); + ExternalAccess(addr, STRIP_PAC_PC(caller_pc), tag, MemoryRead); } SANITIZER_INTERFACE_ATTRIBUTE void __tsan_external_write(void *addr, void *caller_pc, void *tag) { - ExternalAccess(addr, STRIP_PC(caller_pc), tag, MemoryWrite); + ExternalAccess(addr, STRIP_PAC_PC(caller_pc), tag, MemoryWrite); } } // extern "C" diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface.cpp b/compiler-rt/lib/tsan/rtl/tsan_interface.cpp index 55f1c9834f70..9bd0e8580b17 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interface.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_interface.cpp @@ -40,13 +40,13 @@ void __tsan_write16(void *addr) { } void __tsan_read16_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); - MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr + 8, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr + 8, kSizeLog8); } void __tsan_write16_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); - MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr + 8, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr + 8, kSizeLog8); } // __tsan_unaligned_read/write calls are emitted by compiler. 
diff --git a/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h b/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h index f5d743c10772..5e77d4d3d288 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h +++ b/compiler-rt/lib/tsan/rtl/tsan_interface_inl.h @@ -51,35 +51,35 @@ void __tsan_write8(void *addr) { } void __tsan_read1_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog1); + MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog1); } void __tsan_read2_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog2); + MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog2); } void __tsan_read4_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog4); + MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog4); } void __tsan_read8_pc(void *addr, void *pc) { - MemoryRead(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); + MemoryRead(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); } void __tsan_write1_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog1); + MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog1); } void __tsan_write2_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog2); + MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog2); } void __tsan_write4_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog4); + MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog4); } void __tsan_write8_pc(void *addr, void *pc) { - MemoryWrite(cur_thread(), STRIP_PC(pc), (uptr)addr, kSizeLog8); + MemoryWrite(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, kSizeLog8); } void __tsan_vptr_update(void **vptr_p, void *new_val) { @@ -101,7 +101,7 @@ void __tsan_vptr_read(void **vptr_p) { } void __tsan_func_entry(void *pc) { - FuncEntry(cur_thread(), STRIP_PC(pc)); + FuncEntry(cur_thread(), STRIP_PAC_PC(pc)); } void __tsan_func_exit() { @@ -125,9 +125,9 @@ void __tsan_write_range(void *addr, uptr size) { } void __tsan_read_range_pc(void *addr, uptr size, void *pc) { - MemoryAccessRange(cur_thread(), STRIP_PC(pc), (uptr)addr, size, false); + MemoryAccessRange(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, size, false); } void __tsan_write_range_pc(void *addr, uptr size, void *pc) { - MemoryAccessRange(cur_thread(), STRIP_PC(pc), (uptr)addr, size, true); + MemoryAccessRange(cur_thread(), STRIP_PAC_PC(pc), (uptr)addr, size, true); } -- GitLab From c69550c132e5f6eea025ba1f52bd2eb632599d46 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 18 Mar 2021 21:18:07 +0000 Subject: [PATCH 0084/1000] Revert "[mlir] Add linalg.fill bufferization conversion" This reverts commit 32a744ab20f37681f71ca9098625994515f0f4ab. 
CI is broken: test/Dialect/Linalg/bufferize.mlir:274:12: error: CHECK: expected string not found in input // CHECK: %[[MEMREF:.*]] = tensor_to_memref %[[IN]] : memref ^ --- .../Dialect/Linalg/Transforms/Bufferize.cpp | 31 ++++--------------- mlir/test/Dialect/Linalg/bufferize.mlir | 13 -------- 2 files changed, 6 insertions(+), 38 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp index 419226b35179..32b2ee706d19 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp @@ -32,7 +32,8 @@ static Value cloneMemref(Location loc, Value memref, OpBuilder &b) { } static LogicalResult -allocateBuffersForResults(Location loc, LinalgOp linalgOp, ValueRange outputs, +allocateBuffersForResults(Location loc, LinalgOp linalgOp, + linalg::GenericOpAdaptor &adaptor, SmallVectorImpl &resultBuffers, OpBuilder &b) { // Lazily compute loopRanges. SmallVector loopRanges; @@ -51,7 +52,7 @@ allocateBuffersForResults(Location loc, LinalgOp linalgOp, ValueRange outputs, } auto tensorShape = tensorType.getShape(); auto memrefType = MemRefType::get(tensorShape, tensorType.getElementType()); - Value resultTensor = outputs[resultIndex]; + Value resultTensor = adaptor.outputs()[resultIndex]; // Clone output buffers whose value is actually used. if (linalgOp.payloadUsesValueFromOutputOperandIndex(resultIndex)) { @@ -137,7 +138,8 @@ static void finalizeBufferAllocation(ConversionPatternRewriter &rewriter, namespace { -/// Conversion pattern that replaces `linalg.init_tensor` with allocation. +/// Generic conversion pattern that matches any LinalgOp. This avoids template +/// instantiating one pattern for each LinalgOp. class BufferizeInitTensorOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -153,26 +155,6 @@ public: } }; -/// Conversion pattern that bufferizes `linalg.fill` operation. -class BufferizeFillOp : public OpConversionPattern { -public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult - matchAndRewrite(FillOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const final { - linalg::FillOpAdaptor adaptor(operands, op->getAttrDictionary()); - if (!op.output().getType().isa()) - return rewriter.notifyMatchFailure(op, - "operand must be of a tensor type"); - - rewriter.create(op.getLoc(), adaptor.output(), adaptor.value()); - rewriter.replaceOp(op, adaptor.output()); - - return success(); - } -}; - /// Generic conversion pattern that matches any LinalgOp. This avoids template /// instantiating one pattern for each LinalgOp. class BufferizeAnyLinalgOp : public ConversionPattern { @@ -196,7 +178,7 @@ public: Location loc = linalgOp.getLoc(); SmallVector newOutputBuffers; - if (failed(allocateBuffersForResults(loc, linalgOp, adaptor.outputs(), + if (failed(allocateBuffersForResults(loc, linalgOp, adaptor, newOutputBuffers, rewriter))) { linalgOp.emitOpError() << "Failed to allocate buffers for tensor results."; @@ -343,7 +325,6 @@ void mlir::linalg::populateLinalgBufferizePatterns( // TODO: Drop this once tensor constants work in standard. 
// clang-format off patterns.insert< - BufferizeFillOp, BufferizeInitTensorOp, SubTensorOpConverter, SubTensorInsertOpConverter diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir index 1c7cec1de07a..e0027b765d25 100644 --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -265,16 +265,3 @@ func @bufferize_subtensor_insert(%t : tensor, %st0 : tensor<2x3xf32>, % return %t0, %t1: tensor, tensor } -// ----- - -// CHECK-LABEL: func @bufferize_fill( -// CHECK-SAME: %[[IN:.*]]: tensor -func @bufferize_fill(%arg0: tensor) -> tensor { - %c0 = constant 0.0 : f32 - // CHECK: %[[MEMREF:.*]] = tensor_to_memref %[[IN]] : memref - // CHECK: linalg.fill(%[[MEMREF]], %cst) : memref, f32 - // CHECK: %[[TENSOR:.*]] = tensor_load %[[MEMREF]] : memref - // CHECK: return %[[TENSOR]] - %0 = linalg.fill(%arg0, %c0) : tensor, f32 -> tensor - return %0 : tensor -} -- GitLab From fcc1ce00931751ac02498986feb37744e9ace8de Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 18 Mar 2021 17:21:16 -0400 Subject: [PATCH 0085/1000] Revert "Revert "[mlir] Add linalg.fill bufferization conversion"" This reverts commit c69550c132e5f6eea025ba1f52bd2eb632599d46 with proper fix applied. --- .../Dialect/Linalg/Transforms/Bufferize.cpp | 31 +++++++++++++++---- mlir/test/Dialect/Linalg/bufferize.mlir | 13 ++++++++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp index 32b2ee706d19..419226b35179 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp @@ -32,8 +32,7 @@ static Value cloneMemref(Location loc, Value memref, OpBuilder &b) { } static LogicalResult -allocateBuffersForResults(Location loc, LinalgOp linalgOp, - linalg::GenericOpAdaptor &adaptor, +allocateBuffersForResults(Location loc, LinalgOp linalgOp, ValueRange outputs, SmallVectorImpl &resultBuffers, OpBuilder &b) { // Lazily compute loopRanges. SmallVector loopRanges; @@ -52,7 +51,7 @@ allocateBuffersForResults(Location loc, LinalgOp linalgOp, } auto tensorShape = tensorType.getShape(); auto memrefType = MemRefType::get(tensorShape, tensorType.getElementType()); - Value resultTensor = adaptor.outputs()[resultIndex]; + Value resultTensor = outputs[resultIndex]; // Clone output buffers whose value is actually used. if (linalgOp.payloadUsesValueFromOutputOperandIndex(resultIndex)) { @@ -138,8 +137,7 @@ static void finalizeBufferAllocation(ConversionPatternRewriter &rewriter, namespace { -/// Generic conversion pattern that matches any LinalgOp. This avoids template -/// instantiating one pattern for each LinalgOp. +/// Conversion pattern that replaces `linalg.init_tensor` with allocation. class BufferizeInitTensorOp : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -155,6 +153,26 @@ public: } }; +/// Conversion pattern that bufferizes `linalg.fill` operation. 
+class BufferizeFillOp : public OpConversionPattern { +public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(FillOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const final { + linalg::FillOpAdaptor adaptor(operands, op->getAttrDictionary()); + if (!op.output().getType().isa()) + return rewriter.notifyMatchFailure(op, + "operand must be of a tensor type"); + + rewriter.create(op.getLoc(), adaptor.output(), adaptor.value()); + rewriter.replaceOp(op, adaptor.output()); + + return success(); + } +}; + /// Generic conversion pattern that matches any LinalgOp. This avoids template /// instantiating one pattern for each LinalgOp. class BufferizeAnyLinalgOp : public ConversionPattern { @@ -178,7 +196,7 @@ public: Location loc = linalgOp.getLoc(); SmallVector newOutputBuffers; - if (failed(allocateBuffersForResults(loc, linalgOp, adaptor, + if (failed(allocateBuffersForResults(loc, linalgOp, adaptor.outputs(), newOutputBuffers, rewriter))) { linalgOp.emitOpError() << "Failed to allocate buffers for tensor results."; @@ -325,6 +343,7 @@ void mlir::linalg::populateLinalgBufferizePatterns( // TODO: Drop this once tensor constants work in standard. // clang-format off patterns.insert< + BufferizeFillOp, BufferizeInitTensorOp, SubTensorOpConverter, SubTensorInsertOpConverter diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir index e0027b765d25..b9a4362f5e34 100644 --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -265,3 +265,16 @@ func @bufferize_subtensor_insert(%t : tensor, %st0 : tensor<2x3xf32>, % return %t0, %t1: tensor, tensor } +// ----- + +// CHECK-LABEL: func @bufferize_fill( +// CHECK-SAME: %[[IN:.*]]: tensor +func @bufferize_fill(%arg0: tensor) -> tensor { + %c0 = constant 0.0 : f32 + // CHECK: %[[MEMREF:.*]] = memref.buffer_cast %[[IN]] : memref + // CHECK: linalg.fill(%[[MEMREF]], %cst) : memref, f32 + // CHECK: %[[TENSOR:.*]] = memref.tensor_load %[[MEMREF]] : memref + // CHECK: return %[[TENSOR]] + %0 = linalg.fill(%arg0, %c0) : tensor, f32 -> tensor + return %0 : tensor +} -- GitLab From edd6da10d20f8fc025af2131f127c53401def04e Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 15 Mar 2021 15:08:53 -0700 Subject: [PATCH 0086/1000] [AMDGPU] Remove cpol, tfe, and swz from MUBUF patterns These are always selected as 0 anyway. 
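Reduced to a schematic (illustrative only, not the real AMDGPU selector; SelectFoo and matchBaseAndOffset are made-up stand-ins): when a ComplexPattern only ever produces constant zero for some of its outputs, those outputs can be dropped and the zeros written directly into the instruction patterns instead.

// Before: the selector fabricates operands that are always 0.
bool SelectFoo(SDValue Addr, SDValue &Base, SDValue &Offset,
               SDValue &CPol, SDValue &TFE, SDValue &SWZ) const {
  SDLoc DL(Addr);
  CPol = CurDAG->getTargetConstant(0, DL, MVT::i32);
  TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
  SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);
  return matchBaseAndOffset(Addr, Base, Offset);
}

// After: the selector matches only what varies; the patterns supply the
// literal zeros, so the extra outputs and the wrapper overloads that
// threaded them through can be deleted.
bool SelectFoo(SDValue Addr, SDValue &Base, SDValue &Offset) const {
  return matchBaseAndOffset(Addr, Base, Offset);
}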
Differential Revision: https://reviews.llvm.org/D98663 --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 8 --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 47 +++----------- llvm/lib/Target/AMDGPU/BUFInstructions.td | 63 +++++++++---------- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 + .../CodeGen/AMDGPU/extract_vector_elt-i16.ll | 17 ++--- 5 files changed, 48 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 1a5f7aafbb43..29f9c20dc8fd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -113,14 +113,6 @@ def gi_mubuf_offset : GIComplexOperandMatcher, GIComplexPatternEquiv; -def gi_mubuf_addr64_atomic : - GIComplexOperandMatcher, - GIComplexPatternEquiv; - -def gi_mubuf_offset_atomic : - GIComplexOperandMatcher, - GIComplexPatternEquiv; - def gi_smrd_buffer_imm : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 28a21a1270ff..fdb1cf898be3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -188,11 +188,7 @@ private: SDValue &Offset1, unsigned Size) const; bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, - SDValue &Idxen, SDValue &Addr64, SDValue &CPol, SDValue &TFE, - SDValue &SWZ) const; - bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, - SDValue &SOffset, SDValue &Offset, SDValue &CPol, - SDValue &TFE, SDValue &SWZ) const; + SDValue &Idxen, SDValue &Addr64) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset) const; bool SelectMUBUFScratchOffen(SDNode *Parent, @@ -202,9 +198,6 @@ private: SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; - bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, - SDValue &Offset, SDValue &CPol, SDValue &TFE, - SDValue &SWZ) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset) const; @@ -1390,8 +1383,7 @@ bool AMDGPUDAGToDAGISel::SelectDSReadWrite2(SDValue Addr, SDValue &Base, bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, - SDValue &Addr64, SDValue &CPol, - SDValue &TFE, SDValue &SWZ) const { + SDValue &Addr64) const { // Subtarget prefers to use flat instruction // FIXME: This should be a pattern predicate and not reach here if (Subtarget->useFlatForGlobal()) @@ -1399,11 +1391,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, SDLoc DL(Addr); - if (!CPol) - CPol = CurDAG->getTargetConstant(0, DL, MVT::i32); - TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); - SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); - Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1480,8 +1467,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr, bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, - SDValue &Offset, SDValue &CPol, - SDValue &TFE, SDValue &SWZ) const { + SDValue &Offset) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. 
@@ -1489,8 +1475,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, if (!Subtarget->hasAddr64()) return false; - if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - CPol, TFE, SWZ)) + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64)) return false; ConstantSDNode *C = cast(Addr64); @@ -1507,14 +1492,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; } -bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, - SDValue &VAddr, SDValue &SOffset, - SDValue &Offset) const { - SDValue CPol, TFE, SWZ; - - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, CPol, TFE, SWZ); -} - static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { auto PSV = PtrInfo.V.dyn_cast(); return PSV && PSV->isStack(); @@ -1633,15 +1610,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &SOffset, SDValue &Offset, - SDValue &CPol, SDValue &TFE, - SDValue &SWZ) const { + SDValue &SOffset, SDValue &Offset + ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast(Subtarget->getInstrInfo()); - if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - CPol, TFE, SWZ)) + if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64)) return false; if (!cast(Offen)->getSExtValue() && @@ -1660,14 +1635,6 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, return false; } -bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, - SDValue &Soffset, SDValue &Offset - ) const { - SDValue CPol, TFE, SWZ; - - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, CPol, TFE, SWZ); -} - // Find a load or store from corresponding pattern root. // Roots may be build_vector, bitconvert or their combinations. 
static MemSDNode* findMemSDNode(SDNode *N) { diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index d367969702e3..6a3e823e4ac3 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -6,15 +6,12 @@ // //===----------------------------------------------------------------------===// -def MUBUFAddr64 : ComplexPattern; -def MUBUFAddr64Atomic : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; +def MUBUFOffset : ComplexPattern; def MUBUFScratchOffen : ComplexPattern; def MUBUFScratchOffset : ComplexPattern; -def MUBUFOffset : ComplexPattern; -def MUBUFOffsetAtomic : ComplexPattern; - def BUFAddrKind { int Offset = 0; int OffEn = 1; @@ -402,19 +399,19 @@ class getMUBUFInsDA vdataList, RegisterOperand vdata_op = getLdStRegisterOperand.ret; dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, CPol:$cpol), + offset:$offset, CPol_0:$cpol), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, CPol:$cpol) + offset:$offset, CPol_0:$cpol) ); dag InsData = !if(!empty(vaddrList), (ins vdata_op:$vdata, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, CPol:$cpol), + SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol), (ins vdata_op:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, - SCSrc_b32:$soffset, offset:$offset, CPol:$cpol) + SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol) ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins SWZ:$swz), (ins TFE:$tfe, SWZ:$swz)) + !if(isLds, (ins SWZ_0:$swz), (ins TFE_0:$tfe, SWZ_0:$swz)) ); } @@ -506,15 +503,15 @@ class MUBUF_Load_Pseudo : Pat < - (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, CPol:$cpol, i1:$tfe, i1:$swz))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, CPol:$cpol, i1:$tfe, i1:$swz)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset)) >; class MUBUF_Addr64_Load_Pat : Pat < - (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, CPol:$cpol, i1:$tfe, i1:$swz))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, CPol:$cpol, i1:$tfe, i1:$swz)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset)) >; multiclass MUBUF_Pseudo_Load_Pats { @@ -585,12 +582,12 @@ multiclass MUBUF_Pseudo_Stores, + i16:$offset))]>, MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo , + i16:$offset))]>, MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo ; @@ -757,14 +754,14 @@ multiclass MUBUF_Pseudo_Atomics_RTN , MUBUFAddr64Table <0, NAME # "_RTN">; let FPAtomic = isFP in def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo , MUBUFAddr64Table <1, NAME # "_RTN">; @@ -1539,20 +1536,20 @@ def : GCNPat< class MUBUFLoad_PatternADDR64 : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, CPol:$cpol, i1:$tfe, i1:$swz))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, CPol:$cpol, $tfe, $swz) + i16:$offset))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset) >; multiclass MUBUFLoad_Atomic_Pattern { def : GCNPat < (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, 0, 0) + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset) >; def : GCNPat < - (vt (atomic_ld (MUBUFOffsetAtomic v4i32:$rsrc, i32:$soffset, 
i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) + (vt (atomic_ld (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset))), + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset)) >; } @@ -1572,9 +1569,8 @@ multiclass MUBUFLoad_Pattern { def : GCNPat < - (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, CPol:$cpol, i1:$tfe, i1:$swz))), - (Instr_OFFSET $srsrc, $soffset, $offset, CPol:$cpol, $tfe, $swz) + (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))), + (Instr_OFFSET $srsrc, $soffset, $offset) >; } @@ -1612,12 +1608,12 @@ multiclass MUBUFScratchLoadPat_D16 { def : GCNPat < (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, $in) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, $in) >; def : GCNPat < (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, $in) + (InstrOffset $srsrc, $soffset, $offset, $in) >; } @@ -1663,12 +1659,12 @@ multiclass MUBUFStore_Atomic_Pattern ; def : GCNPat < - (atomic_st (MUBUFOffsetAtomic v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0) + (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset)) >; } let SubtargetPredicate = isGFX6GFX7 in { @@ -1681,9 +1677,8 @@ multiclass MUBUFStore_Pattern { def : GCNPat < - (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, - i16:$offset, CPol:$cpol, i1:$tfe, i1:$swz)), - (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, CPol:$cpol, $tfe, $swz) + (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset)), + (Instr_OFFSET $vdata, $srsrc, $soffset, $offset) >; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index af51434514df..19ccb1e28088 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1116,7 +1116,9 @@ def CPol_0 : NamedOperandU32Default0<"CPol", NamedMatchClass<"CPol">>; def CPol_GLC1 : NamedOperandU32Default1<"CPol", NamedMatchClass<"CPol">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def TFE_0 : NamedOperandBit_0<"TFE", NamedMatchClass<"TFE">>; def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; +def SWZ_0 : NamedOperandBit_0<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index b938ff52f21a..729a05c12c74 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -1,14 +1,17 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89,SIVI %s 
; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
; GCN: s_load_dword [[VEC:s[0-9]+]]
-; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
-; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
-; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
-; GCN-DAG: buffer_store_short [[VELT0]]
-; GCN-DAG: buffer_store_short [[VELT1]]
+; SIVI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
+; SIVI-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
+; SIVI-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
+; SIVI-DAG: buffer_store_short [[VELT0]]
+; SIVI-DAG: buffer_store_short [[VELT1]]
+; GFX9: v_mov_b32_e32 [[VVEC:v[0-9]+]], [[VEC]]
+; GFX9: global_store_short_d16_hi v{{[0-9]+}}, [[VVEC]],
+; GFX9: buffer_store_short [[VVEC]],
 define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
   %p0 = extractelement <2 x i16> %vec, i32 0
--
GitLab

From 44f24f3996e8a32d0bb3d6d79a66643c36f088da Mon Sep 17 00:00:00 2001
From: thomasraoux
Date: Thu, 18 Mar 2021 14:57:19 -0700
Subject: [PATCH 0087/1000] [mlir] Fix build failure due to 1a572f4

---
 mlir/lib/Conversion/GPUCommon/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
index 825bed600aba..d9f6867556c6 100644
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -37,4 +37,5 @@ add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
   MLIRPass
   MLIRSupport
   MLIRStandardToLLVM
+  MLIRVectorToLLVM
 )
--
GitLab

From e1579894d2051db8144f484135208c778c7055e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?=
Date: Thu, 18 Mar 2021 17:50:48 +0100
Subject: [PATCH 0088/1000] [lli] Add Orc greedy mode as -jit-kind=orc

In the existing OrcLazy mode, modules go through partitioning and outgoing
calls are replaced by reexport stubs that resolve on call-through. In the
greedy mode that this patch unlocks for lli, modules materialize as a whole
and trigger materialization for all required symbols recursively.

This is useful for testing (e.g. D98785) and it's more similar to the way
MCJIT works.
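For illustration, here is a minimal sketch of the two addition modes against
the ORC LLJIT API. This is not code from the patch: the helper name and the
assumption that the module defines `main` are invented for the example.
```
// Sketch only: greedy vs. lazy module addition. lli's actual setup lives in
// runOrcJIT; this just shows the API difference.
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/Orc/LLJIT.h"
#include "llvm/Support/Error.h"

using namespace llvm;
using namespace llvm::orc;

static ExitOnError ExitOnErr;

int runGreedy(ThreadSafeModule TSM) {
  auto J = ExitOnErr(LLJITBuilder().create());
  // Greedy (-jit-kind=orc): the module materializes as a whole, and looking
  // up one symbol pulls in everything it transitively requires.
  ExitOnErr(J->addIRModule(std::move(TSM)));
  // The lazy path (-jit-kind=orc-lazy) would instead call addLazyIRModule on
  // an LLLazyJIT, deferring codegen until the first call-through.
  auto MainSym = ExitOnErr(J->lookup("main"));
  auto *Main = jitTargetAddressToFunction<int (*)()>(MainSym.getAddress());
  return Main();
}
```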
--- llvm/tools/lli/lli.cpp | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index 66b2c13c426f..32df0711f2fd 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -84,7 +84,7 @@ static codegen::RegisterCodeGenFlags CGF; namespace { - enum class JITKind { MCJIT, OrcLazy }; + enum class JITKind { MCJIT, Orc, OrcLazy }; enum class JITLinkerKind { Default, RuntimeDyld, JITLink }; cl::opt @@ -101,6 +101,7 @@ namespace { "jit-kind", cl::desc("Choose underlying JIT kind."), cl::init(JITKind::MCJIT), cl::values(clEnumValN(JITKind::MCJIT, "mcjit", "MCJIT"), + clEnumValN(JITKind::Orc, "orc", "Orc JIT"), clEnumValN(JITKind::OrcLazy, "orc-lazy", "Orc-based lazy JIT."))); @@ -416,7 +417,7 @@ static void reportError(SMDiagnostic Err, const char *ProgName) { } Error loadDylibs(); -int runOrcLazyJIT(const char *ProgName); +int runOrcJIT(const char *ProgName); void disallowOrcOptions(); //===----------------------------------------------------------------------===// @@ -443,11 +444,12 @@ int main(int argc, char **argv, char * const *envp) { ExitOnErr(loadDylibs()); - if (UseJITKind == JITKind::OrcLazy) - return runOrcLazyJIT(argv[0]); - else + if (UseJITKind == JITKind::MCJIT) disallowOrcOptions(); + else + return runOrcJIT(argv[0]); + // Old lli implementation based on ExecutionEngine and MCJIT. LLVMContext Context; // Load the bitcode... @@ -829,7 +831,7 @@ loadModule(StringRef Path, orc::ThreadSafeContext TSCtx) { return orc::ThreadSafeModule(std::move(M), std::move(TSCtx)); } -int runOrcLazyJIT(const char *ProgName) { +int runOrcJIT(const char *ProgName) { // Start setting up the JIT environment. // Parse the main module. @@ -975,8 +977,17 @@ int runOrcLazyJIT(const char *ProgName) { std::make_unique(GenerateBuiltinFunctions, Mangle)); + // Regular modules are greedy: They materialize as a whole and trigger + // materialization for all required symbols recursively. Lazy modules go + // through partitioning and they replace outgoing calls with reexport stubs + // that resolve on call-through. + auto AddModule = [&](orc::JITDylib &JD, orc::ThreadSafeModule M) { + return UseJITKind == JITKind::OrcLazy ? J->addLazyIRModule(JD, std::move(M)) + : J->addIRModule(JD, std::move(M)); + }; + // Add the main module. - ExitOnErr(J->addLazyIRModule(std::move(MainModule))); + ExitOnErr(AddModule(J->getMainJITDylib(), std::move(MainModule))); // Create JITDylibs and add any extra modules. { @@ -1004,7 +1015,7 @@ int runOrcLazyJIT(const char *ProgName) { assert(EMIdx != 0 && "ExtraModule should have index > 0"); auto JDItr = std::prev(IdxToDylib.lower_bound(EMIdx)); auto &JD = *JDItr->second; - ExitOnErr(J->addLazyIRModule(JD, std::move(M))); + ExitOnErr(AddModule(JD, std::move(M))); } for (auto EAItr = ExtraArchives.begin(), EAEnd = ExtraArchives.end(); -- GitLab From 305a0bad1d5509d2f79123a73d06fff848b9bf88 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 18 Mar 2021 13:19:42 -0700 Subject: [PATCH 0089/1000] [SelectionDAG] Don't pass a scalable vector to MachinePointerInfo::getWithOffset in a unit test. Suppresses an implicit TypeSize to uint64_t conversion warning. We might be able to just not offset it since we're writing to a Fixed stack object, but I wasn't sure so I just did what DAGTypeLegalizer::IncrementPointer does. 
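For context, a minimal sketch of the conversion this warning guards against;
it assumes nothing beyond the TypeSize API that the test already uses.
```
// Sketch: why a scalable TypeSize must not flow into a plain byte offset.
#include "llvm/Support/TypeSize.h"
#include <cstdint>

void typeSizeExample() {
  llvm::TypeSize FixedTS = llvm::TypeSize::Fixed(16);
  uint64_t FixedBytes = FixedTS.getFixedSize(); // OK: the size is exact.

  llvm::TypeSize ScalableTS = llvm::TypeSize::Scalable(16); // 16 * vscale
  // uint64_t Bad = ScalableTS; // Implicit TypeSize -> uint64_t conversion:
  //                            // the kind of conversion the warning flags.
  uint64_t MinBytes = ScalableTS.getKnownMinSize(); // Explicit minimum only.
  (void)FixedBytes;
  (void)MinBytes;
}
```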
Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D98736
---
 .../unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
index c00b6c518e70..626176d4ba9a 100644
--- a/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGAddressAnalysisTest.cpp
@@ -165,14 +165,12 @@ TEST_F(SelectionDAGAddressAnalysisTest, unknownSizeFrameObjects) {
   int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
   MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FI);
   SDValue Value = DAG->getConstant(0, Loc, SubVecVT);
-  TypeSize Offset0 = TypeSize::Fixed(0);
   TypeSize Offset1 = SubVecVT.getStoreSize();
-  SDValue Index0 = DAG->getMemBasePlusOffset(FIPtr, Offset0, Loc);
   SDValue Index1 = DAG->getMemBasePlusOffset(FIPtr, Offset1, Loc);
-  SDValue Store0 = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index0,
-                                 PtrInfo.getWithOffset(Offset0));
+  SDValue Store0 =
+      DAG->getStore(DAG->getEntryNode(), Loc, Value, FIPtr, PtrInfo);
   SDValue Store1 = DAG->getStore(DAG->getEntryNode(), Loc, Value, Index1,
-                                 PtrInfo.getWithOffset(Offset1));
+                                 MachinePointerInfo(PtrInfo.getAddrSpace()));
   Optional<int64_t> NumBytes0 = MemoryLocation::getSizeOrUnknown(
       cast<StoreSDNode>(Store0)->getMemoryVT().getStoreSize());
   Optional<int64_t> NumBytes1 = MemoryLocation::getSizeOrUnknown(
--
GitLab

From 2df65f87c1ea81008768e14522e5d9277234ba70 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Thu, 18 Mar 2021 18:25:21 -0400
Subject: [PATCH 0090/1000] [OpenMP] Fixed a crash in hidden helper thread

It is reported that, after enabling the hidden helper thread, the program can
sometimes hit the assertion `new_gtid < __kmp_threads_capacity`. The root
cause is explained as follows. Let's say the default `__kmp_threads_capacity`
is `N`. If the hidden helper thread is enabled, `__kmp_threads_capacity` is
offset to `N+8` by default. If the number of threads we need exceeds `N+8`,
e.g. via the `num_threads` clause, we need to expand `__kmp_threads`. In
`__kmp_expand_threads`, the expansion starts from `__kmp_threads_capacity` and
repeatedly doubles it until the new capacity meets the requirement. Let's
assume the new requirement is `Y`. If `Y` happens to meet the constraint
`(N+8)*2^X=Y`, where `X` is the number of iterations, the new capacity is not
enough because 8 of the slots are reserved for hidden helper threads.

Here is an example.
```
#include <vector>

int main(int argc, char *argv[]) {
  constexpr const size_t N = 1344;
  std::vector<int> data(N);

#pragma omp parallel for
  for (unsigned i = 0; i < N; ++i) {
    data[i] = i;
  }

#pragma omp parallel for num_threads(N)
  for (unsigned i = 0; i < N; ++i) {
    data[i] += i;
  }

  return 0;
}
```
My CPU is 20C40T, so `__kmp_threads_capacity` is 160. After the offset,
`__kmp_threads_capacity` becomes 168. Since `1344 = (160+8)*2^3`, the
assertion hits.
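The arithmetic can also be checked in isolation; the following stand-alone
sketch (not runtime code) mirrors the 20C40T numbers above:
```
// Sketch of the capacity computation that trips the assertion.
#include <cassert>

int main() {
  const int HiddenHelpers = 8;        // slots reserved for hidden helpers
  int Capacity = 160 + HiddenHelpers; // N = 160, offset to 168
  const int Requested = 1344;         // num_threads(1344)
  while (Capacity < Requested)
    Capacity *= 2;                    // 168 -> 336 -> 672 -> 1344
  // The doubled capacity matches the request exactly, but 8 of the slots
  // are reserved, so only Capacity - 8 regular threads actually fit.
  assert(Capacity - HiddenHelpers < Requested && "capacity short by 8 slots");
  return 0;
}
```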
Reviewed By: protze.joachim Differential Revision: https://reviews.llvm.org/D98838 --- openmp/runtime/src/kmp_runtime.cpp | 15 ++++++- openmp/runtime/src/kmp_settings.cpp | 7 +-- .../capacity_mix_threads.cpp | 45 +++++++++++++++++++ .../hidden_helper_task/capacity_nthreads.cpp | 31 +++++++++++++ 4 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp create mode 100644 openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp diff --git a/openmp/runtime/src/kmp_runtime.cpp b/openmp/runtime/src/kmp_runtime.cpp index 8f42a9d3fe0c..8ebbd0337d55 100644 --- a/openmp/runtime/src/kmp_runtime.cpp +++ b/openmp/runtime/src/kmp_runtime.cpp @@ -854,6 +854,12 @@ static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, if (TCR_PTR(__kmp_threads[0]) == NULL) { --capacity; } + // If it is not for initializing the hidden helper team, we need to take + // __kmp_hidden_helper_threads_num out of the capacity because it is included + // in __kmp_threads_capacity. + if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { + capacity -= __kmp_hidden_helper_threads_num; + } if (__kmp_nth + new_nthreads - (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > capacity) { @@ -3607,6 +3613,13 @@ int __kmp_register_root(int initial_thread) { --capacity; } + // If it is not for initializing the hidden helper team, we need to take + // __kmp_hidden_helper_threads_num out of the capacity because it is included + // in __kmp_threads_capacity. + if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { + capacity -= __kmp_hidden_helper_threads_num; + } + /* see if there are too many threads */ if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { if (__kmp_tp_cached) { @@ -3639,7 +3652,7 @@ int __kmp_register_root(int initial_thread) { /* find an available thread slot */ // Don't reassign the zero slot since we need that to only be used by // initial thread. Slots for hidden helper threads should also be skipped. - if (initial_thread && __kmp_threads[0] == NULL) { + if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { gtid = 0; } else { for (gtid = __kmp_hidden_helper_threads_num + 1; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index 35c15ee2a2e6..dd233484cbc6 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -504,9 +504,10 @@ int __kmp_initial_threads_capacity(int req_nproc) { nth = (4 * __kmp_xproc); // If hidden helper task is enabled, we initialize the thread capacity with - // extra - // __kmp_hidden_helper_threads_num. - nth += __kmp_hidden_helper_threads_num; + // extra __kmp_hidden_helper_threads_num. 
+  if (__kmp_enable_hidden_helper) {
+    nth += __kmp_hidden_helper_threads_num;
+  }
 
   if (nth > __kmp_max_nth)
     nth = __kmp_max_nth;
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
new file mode 100644
index 000000000000..776aee9d8e2c
--- /dev/null
+++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_mix_threads.cpp
@@ -0,0 +1,45 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <omp.h>
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <thread>
+#include <vector>
+
+void dummy_root() {
+  // omp_get_max_threads() will do middle initialization
+  int nthreads = omp_get_max_threads();
+  std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+}
+
+int main(int argc, char *argv[]) {
+  const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
+                                  4 * omp_get_num_procs()),
+                         std::numeric_limits<int>::max());
+
+  std::vector<int> data(N);
+
+  // Create a new thread to initialize the OpenMP RTL. The new thread will not
+  // be taken as the "initial thread".
+  std::thread root(dummy_root);
+
+#pragma omp parallel for num_threads(N)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] = i;
+  }
+
+#pragma omp parallel for num_threads(N + 1)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] += i;
+  }
+
+  for (unsigned i = 0; i < N; ++i) {
+    assert(data[i] == 2 * i);
+  }
+
+  root.join();
+
+  return 0;
+}
diff --git a/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
new file mode 100644
index 000000000000..a9d394f729e9
--- /dev/null
+++ b/openmp/runtime/test/tasking/hidden_helper_task/capacity_nthreads.cpp
@@ -0,0 +1,31 @@
+// RUN: %libomp-cxx-compile-and-run
+
+#include <omp.h>
+
+#include <algorithm>
+#include <cassert>
+#include <vector>
+
+int main(int argc, char *argv[]) {
+  const int N = std::min(std::max(std::max(32, 4 * omp_get_max_threads()),
+                                  4 * omp_get_num_procs()),
+                         std::numeric_limits<int>::max());
+
+  std::vector<int> data(N);
+
+#pragma omp parallel for num_threads(N)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] = i;
+  }
+
+#pragma omp parallel for num_threads(N + 1)
+  for (unsigned i = 0; i < N; ++i) {
+    data[i] += i;
+  }
+
+  for (unsigned i = 0; i < N; ++i) {
+    assert(data[i] == 2 * i);
+  }
+
+  return 0;
+}
--
GitLab

From b4a8c0ebb6d49f757c687833d85f843aaeb19133 Mon Sep 17 00:00:00 2001
From: Yuanfang Chen
Date: Thu, 18 Mar 2021 15:32:29 -0700
Subject: [PATCH 0091/1000] [LTO][MC] Discard non-prevailing defined symbols in module-level assembly

This is the alternative approach to D96931.

In LTO, for each module with an inlineasm block, prepend the directive
".lto_discard <sym>, <sym>*" to the beginning of the inline asm.
".lto_discard" is both a module inlineasm block marker and (optionally)
provides the list of symbols to be discarded.

In MC, while emitting the inlineasm, discard symbol bindings and symbol
definitions according to ".lto_discard".
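As a sketch of the injection (the symbol name `foo` is hypothetical; the real
logic, including the `.symver` exception, is in the LTO.cpp hunk below):
```
// Sketch: how a module's inline asm gets rewritten when `foo` was resolved
// as non-prevailing. The directive is always prepended as a marker; the
// symbol list may be empty.
#include "llvm/IR/Module.h"

void markNonPrevailing(llvm::Module &M) {
  M.setModuleInlineAsm(".lto_discard foo\n" + M.getModuleInlineAsm());
}
```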
Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D98762 --- llvm/include/llvm/MC/MCContext.h | 1 - llvm/include/llvm/MC/MCParser/MCAsmParser.h | 2 + llvm/lib/LTO/LTO.cpp | 30 ++++++- llvm/lib/MC/MCParser/AsmParser.cpp | 48 ++++++++++- llvm/lib/MC/MCParser/ELFAsmParser.cpp | 6 ++ llvm/test/LTO/X86/inline-asm-lto-discard.ll | 87 ++++++++++++++++++++ llvm/test/LTO/X86/inline-asm-lto-discard2.ll | 29 +++++++ llvm/test/MC/ELF/lto-discard.s | 31 +++++++ 8 files changed, 231 insertions(+), 3 deletions(-) create mode 100644 llvm/test/LTO/X86/inline-asm-lto-discard.ll create mode 100644 llvm/test/LTO/X86/inline-asm-lto-discard2.ll create mode 100644 llvm/test/MC/ELF/lto-discard.s diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h index 106763c5d7c2..f07e5a89b101 100644 --- a/llvm/include/llvm/MC/MCContext.h +++ b/llvm/include/llvm/MC/MCContext.h @@ -396,7 +396,6 @@ namespace llvm { void initInlineSourceManager(); SourceMgr *getInlineSourceManager() { - assert(InlineSrcMgr); return InlineSrcMgr.get(); } std::vector &getLocInfos() { return LocInfos; } diff --git a/llvm/include/llvm/MC/MCParser/MCAsmParser.h b/llvm/include/llvm/MC/MCParser/MCAsmParser.h index 02cc22009196..24d4ada5fa0b 100644 --- a/llvm/include/llvm/MC/MCParser/MCAsmParser.h +++ b/llvm/include/llvm/MC/MCParser/MCAsmParser.h @@ -182,6 +182,8 @@ public: virtual void setParsingMSInlineAsm(bool V) = 0; virtual bool isParsingMSInlineAsm() = 0; + virtual bool discardLTOSymbol(StringRef) const { return false; } + virtual bool isParsingMasm() const { return false; } virtual bool defineMacro(StringRef Name, StringRef Value) { return true; } diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 8bcb1600925d..3cd8c78c42e6 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -11,7 +11,9 @@ //===----------------------------------------------------------------------===// #include "llvm/LTO/LTO.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/StackSafetyAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -752,6 +754,7 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, Skip(); std::set NonPrevailingComdats; + SmallSet NonPrevailingAsmSymbols; for (const InputFile::Symbol &Sym : Syms) { assert(ResI != ResE); SymbolResolution Res = *ResI++; @@ -798,7 +801,14 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, GV->setDLLStorageClass(GlobalValue::DLLStorageClassTypes:: DefaultStorageClass); } + } else if (auto *AS = Msym.dyn_cast()) { + // Collect non-prevailing symbols. + if (!Res.Prevailing) + NonPrevailingAsmSymbols.insert(AS->first); + } else { + llvm_unreachable("unknown symbol type"); } + // Common resolution: collect the maximum size/alignment over all commons. // We also record if we see an instance of a common as prevailing, so that // if none is prevailing we can ignore it later. @@ -812,11 +822,29 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef Syms, CommonRes.Align = max(*SymAlign, CommonRes.Align); CommonRes.Prevailing |= Res.Prevailing; } - } + if (!M.getComdatSymbolTable().empty()) for (GlobalValue &GV : M.global_values()) handleNonPrevailingComdat(GV, NonPrevailingComdats); + + // Prepend ".lto_discard , *" directive to each module inline asm + // block. 
+ if (!M.getModuleInlineAsm().empty()) { + std::string NewIA = ".lto_discard"; + if (!NonPrevailingAsmSymbols.empty()) { + // Don't dicard a symbol if there is a live .symver for it. + ModuleSymbolTable::CollectAsmSymvers( + M, [&](StringRef Name, StringRef Alias) { + if (!NonPrevailingAsmSymbols.count(Alias)) + NonPrevailingAsmSymbols.erase(Name); + }); + NewIA += " " + llvm::join(NonPrevailingAsmSymbols, ", "); + } + NewIA += "\n"; + M.setModuleInlineAsm(NewIA + M.getModuleInlineAsm()); + } + assert(MsymI == MsymE); return std::move(Mod); } diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 3ef51e69ab7e..261d1e9394eb 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" @@ -168,6 +169,8 @@ private: /// List of forward directional labels for diagnosis at the end. SmallVector, 4> DirLabels; + SmallSet LTODiscardSymbols; + /// AssemblerDialect. ~OU means unset value and use value provided by MAI. unsigned AssemblerDialect = ~0U; @@ -235,6 +238,10 @@ public: } bool isParsingMSInlineAsm() override { return ParsingMSInlineAsm; } + bool discardLTOSymbol(StringRef Name) const override { + return LTODiscardSymbols.contains(Name); + } + bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString, unsigned &NumOutputs, unsigned &NumInputs, SmallVectorImpl> &OpDecls, @@ -516,6 +523,7 @@ private: DK_ADDRSIG, DK_ADDRSIG_SYM, DK_PSEUDO_PROBE, + DK_LTO_DISCARD, DK_END }; @@ -682,6 +690,9 @@ private: // .pseudoprobe bool parseDirectivePseudoProbe(); + // ".lto_discard" + bool parseDirectiveLTODiscard(); + // Directives to support address-significance tables. bool parseDirectiveAddrsig(); bool parseDirectiveAddrsigSym(); @@ -892,6 +903,8 @@ bool AsmParser::enabledGenDwarfForAssembly() { } bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { + LTODiscardSymbols.clear(); + // Create the initial section, if requested. if (!NoInitialTextSection) Out.InitSections(false); @@ -1770,7 +1783,6 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, StringMap::const_iterator DirKindIt = DirectiveKindMap.find(IDVal.lower()); DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end()) - ? DK_NO_DIRECTIVE : DirKindIt->getValue(); switch (DirKind) { @@ -1868,6 +1880,9 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, Lex(); } + if (discardLTOSymbol(IDVal)) + return false; + getTargetParser().doBeforeLabelEmit(Sym); // Emit the label. @@ -2208,6 +2223,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info, return parseDirectiveAddrsigSym(); case DK_PSEUDO_PROBE: return parseDirectivePseudoProbe(); + case DK_LTO_DISCARD: + return parseDirectiveLTODiscard(); } return Error(IDLoc, "unknown directive"); @@ -2852,6 +2869,9 @@ bool AsmParser::parseAssignment(StringRef Name, bool allow_redef, return false; } + if (discardLTOSymbol(Name)) + return false; + // Do the assignment. Out.emitAssignment(Sym, Value); if (NoDeadStrip) @@ -4870,6 +4890,10 @@ bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) { SMLoc Loc = getTok().getLoc(); if (parseIdentifier(Name)) return Error(Loc, "expected identifier"); + + if (discardLTOSymbol(Name)) + return false; + MCSymbol *Sym = getContext().getOrCreateSymbol(Name); // Assembler local symbols don't make any sense here. 
Complain loudly. @@ -5493,6 +5517,7 @@ void AsmParser::initializeDirectiveKindMap() { DirectiveKindMap[".addrsig"] = DK_ADDRSIG; DirectiveKindMap[".addrsig_sym"] = DK_ADDRSIG_SYM; DirectiveKindMap[".pseudoprobe"] = DK_PSEUDO_PROBE; + DirectiveKindMap[".lto_discard"] = DK_LTO_DISCARD; } MCAsmMacro *AsmParser::parseMacroLikeBody(SMLoc DirectiveLoc) { @@ -5806,6 +5831,27 @@ bool AsmParser::parseDirectivePseudoProbe() { return false; } +/// parseDirectiveLTODiscard +/// ::= ".lto_discard" [ identifier ( , identifier )* ] +/// The LTO library emits this directive to discard non-prevailing symbols. +/// We ignore symbol assignments and attribute changes for the specified +/// symbols. +bool AsmParser::parseDirectiveLTODiscard() { + auto ParseOp = [&]() -> bool { + StringRef Name; + SMLoc Loc = getTok().getLoc(); + if (parseIdentifier(Name)) + return Error(Loc, "expected identifier"); + LTODiscardSymbols.insert(Name); + return false; + }; + + LTODiscardSymbols.clear(); + if (parseMany(ParseOp)) + return addErrorSuffix(" in directive"); + return false; +} + // We are comparing pointers, but the pointers are relative to a single string. // Thus, this should always be deterministic. static int rewritesSort(const AsmRewrite *AsmRewriteA, diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp index 5b3f0225bba9..70d69fc8dd32 100644 --- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp +++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp @@ -182,6 +182,12 @@ bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) { if (getParser().parseIdentifier(Name)) return TokError("expected identifier in directive"); + if (getParser().discardLTOSymbol(Name)) { + if (getLexer().is(AsmToken::EndOfStatement)) + break; + continue; + } + MCSymbol *Sym = getContext().getOrCreateSymbol(Name); getStreamer().emitSymbolAttribute(Sym, Attr); diff --git a/llvm/test/LTO/X86/inline-asm-lto-discard.ll b/llvm/test/LTO/X86/inline-asm-lto-discard.ll new file mode 100644 index 000000000000..4893eb186cfb --- /dev/null +++ b/llvm/test/LTO/X86/inline-asm-lto-discard.ll @@ -0,0 +1,87 @@ +; Check that non-prevailing symbols in module inline assembly are discarded +; during regular LTO otherwise the final symbol binding could be wrong. + +; RUN: split-file %s %t +; RUN: opt %t/t1.ll -o %t1 +; RUN: opt %t/t2.ll -o %t2 +; RUN: opt %t/t3.ll -o %t3 +; RUN: opt %t/t4.ll -o %t4 + +; RUN: llvm-lto2 run -o %to1 -save-temps %t1 %t2 \ +; RUN: -r %t1,foo,px \ +; RUN: -r %t2,foo, \ +; RUN: -r %t2,bar,pl +; RUN: llvm-dis < %to1.0.0.preopt.bc | FileCheck %s --check-prefix=ASM1 +; RUN: llvm-nm %to1.0 | FileCheck %s --check-prefix=SYM +; RUN: llvm-objdump -d --disassemble-symbols=foo %to1.0 \ +; RUN: | FileCheck %s --check-prefix=DEF + +; RUN: llvm-lto2 run -o %to2 -save-temps %t2 %t3 \ +; RUN: -r %t2,foo, \ +; RUN: -r %t2,bar,pl \ +; RUN: -r %t3,foo,px +; RUN: llvm-dis < %to2.0.0.preopt.bc | FileCheck %s --check-prefix=ASM2 +; RUN: llvm-nm %to2.0 | FileCheck %s --check-prefix=SYM +; RUN: llvm-objdump -d --disassemble-symbols=foo %to2.0 \ +; RUN: | FileCheck %s --check-prefix=DEF + +; Check that ".symver" is properly handled. 
+; RUN: llvm-lto2 run -o %to3 -save-temps %t4 \ +; RUN: -r %t4,bar, \ +; RUN: -r %t4,foo, \ +; RUN: -r %t4,foo@@VER1,px +; RUN: llvm-dis < %to3.0.0.preopt.bc | FileCheck %s --check-prefix=ASM3 + +; ASM1: module asm ".lto_discard foo" +; ASM1-NEXT: module asm ".weak foo" +; ASM1-NEXT: module asm ".equ foo,bar" + +; ASM2: module asm ".lto_discard foo" +; ASM2-NEXT: module asm ".weak foo" +; ASM2-NEXT: module asm ".equ foo,bar" +; ASM2-NEXT: module asm ".lto_discard" +; ASM2-NEXT: module asm " .global foo ; foo: leal 2(%rdi), %eax" + +; ASM3-NOT: module asm ".lto_discard foo" + +; SYM: T foo + +; DEF: leal 2(%rdi), %eax + +;--- t1.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local i32 @foo(i32 %0) { + %2 = add nsw i32 %0, 2 + ret i32 %2 +} + +;--- t2.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +module asm ".weak foo" +module asm ".equ foo,bar" + +@llvm.compiler.used = appending global [1 x i8*] [i8* bitcast (i32 (i32)* @bar to i8*)], section "llvm.metadata" + +define internal i32 @bar(i32 %0) { + %2 = add nsw i32 %0, 1 + ret i32 %2 +} + +;--- t3.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +module asm " .global foo ; foo: leal 2(%rdi), %eax" + +;--- t4.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +module asm ".global foo" +module asm "foo: call bar" +module asm ".symver foo,foo@@@VER1" +module asm ".symver bar,bar@@@VER1" diff --git a/llvm/test/LTO/X86/inline-asm-lto-discard2.ll b/llvm/test/LTO/X86/inline-asm-lto-discard2.ll new file mode 100644 index 000000000000..5d111d0a52e3 --- /dev/null +++ b/llvm/test/LTO/X86/inline-asm-lto-discard2.ll @@ -0,0 +1,29 @@ +; Check that +; 1. ".lto_discard" works as module inlineasm marker and its argument symbols +; are discarded. +; 2. there is no reassignment error in the presence of ".lto_discard" +; RUN: llc < %s | FileCheck %s + +; CHECK: .data +; CHECK-NOT: .weak foo +; CHECK-NOT: .set foo, bar +; CHECK: .globl foo +; CHECK: foo: +; CHECK: .byte 1 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +module asm ".lto_discard foo" +module asm " .text" +module asm "bar:" +module asm " .data" +module asm ".weak foo" +module asm ".set foo, bar" +module asm ".weak foo" +module asm ".set foo, bar" + +module asm ".lto_discard" +module asm ".globl foo" +module asm "foo:" +module asm " .byte 1" diff --git a/llvm/test/MC/ELF/lto-discard.s b/llvm/test/MC/ELF/lto-discard.s new file mode 100644 index 000000000000..75a7d7ea5e91 --- /dev/null +++ b/llvm/test/MC/ELF/lto-discard.s @@ -0,0 +1,31 @@ +// Check that ".lto_discard" ignores symbol assignments and attribute changes +// for the specified symbols. +// RUN: llvm-mc -triple x86_64 < %s | FileCheck %s + +// Check that ".lto_discard" only accepts identifiers. 
+// RUN: not llvm-mc -filetype=obj -triple x86_64 --defsym ERR=1 %s 2>&1 |\ +// RUN: FileCheck %s --check-prefix=ERR + +// CHECK: .weak foo +// CHECK: foo: +// CHECK: .byte 1 +// CHECK: .weak bar +// CHECK: bar: +// CHECK: .byte 2 + +.lto_discard foo +.weak foo +foo: + .byte 1 + +.lto_discard +.weak bar +bar: + .byte 2 + + +.ifdef ERR +.text +# ERR: {{.*}}.s:[[#@LINE+1]]:14: error: expected identifier in directive +.lto_discard 1 +.endif -- GitLab From 182b831aebc0569e8344d848fa20f0c67f43d55a Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 18 Mar 2021 15:29:52 -0700 Subject: [PATCH 0092/1000] [DAGCombiner][RISCV] Teach visitMGATHER/MSCATTER to remove gather/scatters with all zeros masks that use SPLAT_VECTOR. Previously only all zeros BUILD_VECTOR was recognized. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +- llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll | 98 +++++++++++++++++++ .../test/CodeGen/RISCV/rvv/mscatter-sdnode.ll | 84 ++++++++++++++++ 3 files changed, 184 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1c063dae9d88..382fc91285a0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9618,7 +9618,7 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) { SDLoc DL(N); // Zap scatters with a zero mask. - if (ISD::isBuildVectorAllZeros(Mask.getNode())) + if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) return Chain; if (refineUniformBase(BasePtr, Index, DAG)) { @@ -9674,7 +9674,7 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) { SDLoc DL(N); // Zap gathers with a zero mask. - if (ISD::isBuildVectorAllZeros(Mask.getNode())) + if (ISD::isConstantSplatVectorAllZeros(Mask.getNode())) return CombineTo(N, PassThru, MGT->getChain()); if (refineUniformBase(BasePtr, Index, DAG)) { diff --git a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll index c5f9ea8aa3e3..d567ff9a0140 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mgather-sdnode.ll @@ -210,6 +210,20 @@ define @mgather_truemask_nxv4i8( %ptrs, %v } +define @mgather_falsemask_nxv4i8( %ptrs, %passthru) { +; RV32-LABEL: mgather_falsemask_nxv4i8: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_falsemask_nxv4i8: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4i8.nxv4p0i8( %ptrs, i32 1, zeroinitializer, %passthru) + ret %v +} + declare @llvm.masked.gather.nxv8i8.nxv8p0i8(, i32, , ) define @mgather_nxv8i8( %ptrs, %m, %passthru) { @@ -417,6 +431,20 @@ define @mgather_truemask_nxv4i16( %ptrs, < ret %v } +define @mgather_falsemask_nxv4i16( %ptrs, %passthru) { +; RV32-LABEL: mgather_falsemask_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_falsemask_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4i16.nxv4p0i16( %ptrs, i32 2, zeroinitializer, %passthru) + ret %v +} + declare @llvm.masked.gather.nxv8i16.nxv8p0i16(, i32, , ) define @mgather_nxv8i16( %ptrs, %m, %passthru) { @@ -661,6 +689,20 @@ define @mgather_truemask_nxv4i32( %ptrs, < ret %v } +define @mgather_falsemask_nxv4i32( %ptrs, %passthru) { +; RV32-LABEL: mgather_falsemask_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_falsemask_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: 
vmv2r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4i32.nxv4p0i32( %ptrs, i32 4, zeroinitializer, %passthru) + ret %v +} + declare @llvm.masked.gather.nxv8i32.nxv8p0i32(, i32, , ) define @mgather_nxv8i32( %ptrs, %m, %passthru) { @@ -937,6 +979,20 @@ define @mgather_truemask_nxv4i64( %ptrs, < ret %v } +define @mgather_falsemask_nxv4i64( %ptrs, %passthru) { +; RV32-LABEL: mgather_falsemask_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_falsemask_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4i64.nxv4p0i64( %ptrs, i32 8, zeroinitializer, %passthru) + ret %v +} + declare @llvm.masked.gather.nxv8i64.nxv8p0i64(, i32, , ) define @mgather_nxv8i64( %ptrs, %m, %passthru) { @@ -1354,6 +1410,20 @@ define @mgather_truemask_nxv4f16( %ptrs, ret %v } +define @mgather_falsemask_nxv4f16( %ptrs, %passthru) { +; RV32-LABEL: mgather_falsemask_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: vmv1r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_falsemask_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: vmv1r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4f16.nxv4p0f16( %ptrs, i32 2, zeroinitializer, %passthru) + ret %v +} + declare @llvm.masked.gather.nxv8f16.nxv8p0f16(, i32, , ) define @mgather_nxv8f16( %ptrs, %m, %passthru) { @@ -1554,6 +1624,20 @@ define @mgather_truemask_nxv4f32( %ptr ret %v } +define @mgather_falsemask_nxv4f32( %ptrs, %passthru) { +; RV32-LABEL: mgather_falsemask_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: vmv2r.v v8, v10 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_falsemask_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: vmv2r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4f32.nxv4p0f32( %ptrs, i32 4, zeroinitializer, %passthru) + ret %v +} + declare @llvm.masked.gather.nxv8f32.nxv8p0f32(, i32, , ) define @mgather_nxv8f32( %ptrs, %m, %passthru) { @@ -1830,6 +1914,20 @@ define @mgather_truemask_nxv4f64( %p ret %v } +define @mgather_falsemask_nxv4f64( %ptrs, %passthru) { +; RV32-LABEL: mgather_falsemask_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: vmv4r.v v8, v12 +; RV32-NEXT: ret +; +; RV64-LABEL: mgather_falsemask_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: vmv4r.v v8, v12 +; RV64-NEXT: ret + %v = call @llvm.masked.gather.nxv4f64.nxv4p0f64( %ptrs, i32 8, zeroinitializer, %passthru) + ret %v +} + declare @llvm.masked.gather.nxv8f64.nxv8p0f64(, i32, , ) define @mgather_nxv8f64( %ptrs, %m, %passthru) { diff --git a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll index 424ea2f90458..57a9e0019f7a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/mscatter-sdnode.ll @@ -145,6 +145,18 @@ define void @mscatter_truemask_nxv4i8( %val, ret void } +define void @mscatter_falsemask_nxv4i8( %val, %ptrs) { +; RV32-LABEL: mscatter_falsemask_nxv4i8: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_falsemask_nxv4i8: +; RV64: # %bb.0: +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i8.nxv4p0i8( %val, %ptrs, i32 1, zeroinitializer) + ret void +} + declare void @llvm.masked.scatter.nxv8i8.nxv8p0i8(, , i32, ) define void @mscatter_nxv8i8( %val, %ptrs, %m) { @@ -298,6 +310,18 @@ define void @mscatter_truemask_nxv4i16( %val, %val, %ptrs) { +; RV32-LABEL: mscatter_falsemask_nxv4i16: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_falsemask_nxv4i16: +; RV64: # %bb.0: +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i16.nxv4p0i16( %val, %ptrs, 
i32 2, zeroinitializer) + ret void +} + declare void @llvm.masked.scatter.nxv8i16.nxv8p0i16(, , i32, ) define void @mscatter_nxv8i16( %val, %ptrs, %m) { @@ -501,6 +525,18 @@ define void @mscatter_truemask_nxv4i32( %val, %val, %ptrs) { +; RV32-LABEL: mscatter_falsemask_nxv4i32: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_falsemask_nxv4i32: +; RV64: # %bb.0: +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i32.nxv4p0i32( %val, %ptrs, i32 4, zeroinitializer) + ret void +} + declare void @llvm.masked.scatter.nxv8i32.nxv8p0i32(, , i32, ) define void @mscatter_nxv8i32( %val, %ptrs, %m) { @@ -748,6 +784,18 @@ define void @mscatter_truemask_nxv4i64( %val, %val, %ptrs) { +; RV32-LABEL: mscatter_falsemask_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_falsemask_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4i64.nxv4p0i64( %val, %ptrs, i32 8, zeroinitializer) + ret void +} + declare void @llvm.masked.scatter.nxv8i64.nxv8p0i64(, , i32, ) define void @mscatter_nxv8i64( %val, %ptrs, %m) { @@ -1054,6 +1102,18 @@ define void @mscatter_truemask_nxv4f16( %val, %val, %ptrs) { +; RV32-LABEL: mscatter_falsemask_nxv4f16: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_falsemask_nxv4f16: +; RV64: # %bb.0: +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4f16.nxv4p0f16( %val, %ptrs, i32 2, zeroinitializer) + ret void +} + declare void @llvm.masked.scatter.nxv8f16.nxv8p0f16(, , i32, ) define void @mscatter_nxv8f16( %val, %ptrs, %m) { @@ -1238,6 +1298,18 @@ define void @mscatter_truemask_nxv4f32( %val, %val, %ptrs) { +; RV32-LABEL: mscatter_falsemask_nxv4f32: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_falsemask_nxv4f32: +; RV64: # %bb.0: +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4f32.nxv4p0f32( %val, %ptrs, i32 4, zeroinitializer) + ret void +} + declare void @llvm.masked.scatter.nxv8f32.nxv8p0f32(, , i32, ) define void @mscatter_nxv8f32( %val, %ptrs, %m) { @@ -1485,6 +1557,18 @@ define void @mscatter_truemask_nxv4f64( %val, %val, %ptrs) { +; RV32-LABEL: mscatter_falsemask_nxv4f64: +; RV32: # %bb.0: +; RV32-NEXT: ret +; +; RV64-LABEL: mscatter_falsemask_nxv4f64: +; RV64: # %bb.0: +; RV64-NEXT: ret + call void @llvm.masked.scatter.nxv4f64.nxv4p0f64( %val, %ptrs, i32 8, zeroinitializer) + ret void +} + declare void @llvm.masked.scatter.nxv8f64.nxv8p0f64(, , i32, ) define void @mscatter_nxv8f64( %val, %ptrs, %m) { -- GitLab From 80df56f7f9efbf54ac05eb14120eacb6d2c70071 Mon Sep 17 00:00:00 2001 From: Yuanfang Chen Date: Thu, 18 Mar 2021 15:52:14 -0700 Subject: [PATCH 0093/1000] Fix test case in b4a8c0ebb6d4 --- llvm/test/MC/ELF/lto-discard.s | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/test/MC/ELF/lto-discard.s b/llvm/test/MC/ELF/lto-discard.s index 75a7d7ea5e91..8ea8ab5775cd 100644 --- a/llvm/test/MC/ELF/lto-discard.s +++ b/llvm/test/MC/ELF/lto-discard.s @@ -1,17 +1,16 @@ // Check that ".lto_discard" ignores symbol assignments and attribute changes // for the specified symbols. -// RUN: llvm-mc -triple x86_64 < %s | FileCheck %s +// RUN: llvm-mc -triple x86_64-pc-linux-gnu < %s | FileCheck %s // Check that ".lto_discard" only accepts identifiers. 
-// RUN: not llvm-mc -filetype=obj -triple x86_64 --defsym ERR=1 %s 2>&1 |\ +// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu --defsym ERR=1 %s 2>&1 |\ // RUN: FileCheck %s --check-prefix=ERR -// CHECK: .weak foo -// CHECK: foo: -// CHECK: .byte 1 -// CHECK: .weak bar -// CHECK: bar: -// CHECK: .byte 2 +// CHECK-NOT: .weak foo +// CHECK-NOT: foo: +// CHECK: .weak bar +// CHECK: bar: +// CHECK: .byte 2 .lto_discard foo .weak foo -- GitLab From 5627564fe053bd257385157cea43e795e7c48e3f Mon Sep 17 00:00:00 2001 From: Rob Suderman Date: Wed, 17 Mar 2021 13:41:53 -0700 Subject: [PATCH 0094/1000] [mlir][tosa] Add tosa.concat to subtensor inserts lowering Includes lowering for tosa.concat with indice computation with subtensor insert operations. Includes tests along two different indices. Differential Revision: https://reviews.llvm.org/D98813 --- .../Conversion/TosaToLinalg/CMakeLists.txt | 1 + .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 52 ++++++++++++++++++- .../TosaToLinalg/TosaToLinalgPass.cpp | 8 +-- .../TosaToLinalg/tosa-to-linalg.mlir | 40 ++++++++++++++ 4 files changed, 96 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt b/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt index 8a53b9da025b..a44621ec6033 100644 --- a/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt +++ b/mlir/lib/Conversion/TosaToLinalg/CMakeLists.txt @@ -14,6 +14,7 @@ add_mlir_conversion_library(MLIRTosaToLinalg MLIRLinalg MLIRLinalgUtils MLIRMath + MLIRMemRef MLIRPass MLIRTosa MLIRTosaTransforms diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 2fe4aa31e482..dd2725cbd0fa 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -13,6 +13,7 @@ #include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/IR/Matchers.h" @@ -657,6 +658,53 @@ public: } }; +struct ConcatOpConversion : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(tosa::ConcatOp op, ArrayRef args, + ConversionPatternRewriter &rewriter) const override { + auto resultType = op.getType().dyn_cast(); + if (!resultType || !resultType.hasStaticShape()) { + return rewriter.notifyMatchFailure(op, + "expected static shaped tensor type"); + } + + Location loc = op.getLoc(); + int axis = op.axis(); + Value axisValue = + rewriter.create(loc, rewriter.getIndexAttr(axis)); + int rank = resultType.getRank(); + SmallVector offsets, sizes, strides; + sizes.reserve(rank); + strides.resize(rank, rewriter.create(loc, 1)); + offsets.resize(rank, rewriter.create(loc, 0)); + + for (int i = 0; i < rank; ++i) { + sizes.push_back(rewriter.create(loc, args[0], i)); + } + + Value resultDimSize = sizes[axis]; + for (auto arg : args.drop_front()) { + auto size = rewriter.create(loc, arg, axisValue); + resultDimSize = rewriter.create(loc, resultDimSize, size); + } + sizes[axis] = resultDimSize; + + Value result = rewriter.create( + loc, resultType.getShape(), resultType.getElementType()); + + for (auto arg : args) { + sizes[axis] = rewriter.create(loc, arg, axisValue); + result = rewriter.create(loc, arg, result, offsets, + sizes, strides); + offsets[axis] = rewriter.create(loc, offsets[axis], sizes[axis]); + } + 
rewriter.replaceOp(op, result); + return success(); + } +}; + } // namespace void mlir::tosa::populateTosaToLinalgOnTensorsConversionPatterns( @@ -680,6 +728,6 @@ void mlir::tosa::populateTosaToLinalgOnTensorsConversionPatterns( IdentityNConverter, IdentityNConverter, ReduceConverter, ReduceConverter, ReduceConverter, - ReduceConverter, ReshapeOpConverter, - TransposeConverter>(context); + ReduceConverter, ConcatOpConversion, + ReshapeOpConverter, TransposeConverter>(context); } diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp index 8ccf83529457..a1bd694f67af 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp @@ -14,6 +14,7 @@ #include "mlir/Conversion/TosaToLinalg/TosaToLinalg.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Math/IR/Math.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Transforms/PassDetail.h" @@ -31,14 +32,15 @@ struct TosaToLinalgOnTensors : public TosaToLinalgOnTensorsBase { public: void getDependentDialects(DialectRegistry ®istry) const override { - registry - .insert(); + registry.insert(); } void runOnFunction() override { OwningRewritePatternList patterns; ConversionTarget target(getContext()); - target.addLegalDialect(); + target.addLegalDialect(); target.addIllegalDialect(); target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index d1868e7683ce..9b1f6054ee06 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -433,3 +433,43 @@ func @reduce_int(%arg0: tensor<5x4xi32>) -> () { %4 = "tosa.reduce_max"(%arg0) {axis = 0 : i64} : (tensor<5x4xi32>) -> tensor<4xi32> return } + +// ----- + +// CHECK-LABEL: @concat +func @concat(%arg0: tensor<5x1xf32>, %arg1: tensor<6x1xf32>) -> () { + // CHECK: [[AXIS:%.+]] = constant 0 + // CHECK: [[STRIDE:%.+]] = constant 1 + // CHECK: [[OFFSET:%.+]] = constant 0 : index + // CHECK: [[IDX0:%.+]] = constant 0 : index + // CHECK: [[ARG0_DIM0:%.+]] = memref.dim %arg0, [[IDX0]] + // CHECK: [[IDX1:%.+]] = constant 1 : index + // CHECK: [[ARG0_DIM1:%.+]] = memref.dim %arg0, [[IDX1]] + // CHECK: [[ARG1_AXIS:%.+]] = memref.dim %arg1, [[AXIS]] + // CHECK: [[RESULT_AXIS:%.+]] = addi [[ARG0_DIM0]], [[ARG1_AXIS]] + // CHECK: [[INIT:%.+]] = linalg.init_tensor [11, 1] + // CHECK: [[ARG0_DIM0:%.+]] = memref.dim %arg0, [[AXIS]] + // CHECK: [[INSERT0:%.+]] = subtensor_insert %arg0 into [[INIT]]{{\[}}[[OFFSET]], [[OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]] + // CHECK: [[NEW_OFFSET:%.+]] = addi [[OFFSET]], [[ARG0_DIM0]] + // CHECK: [[ARG1_DIM0:%.+]] = memref.dim %arg1, [[AXIS]] + // CHECK: [[INSERT1:%.+]] = subtensor_insert %arg1 into [[INSERT0]]{{\[}}[[NEW_OFFSET]], [[OFFSET]]] {{\[}}[[ARG1_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]] + %0 = "tosa.concat"(%arg0, %arg1) { axis = 0 : i64} : (tensor<5x1xf32>, tensor<6x1xf32>) -> (tensor<11x1xf32>) + + // CHECK: [[AXIS:%.+]] = constant 1 + // CHECK: [[STRIDE:%.+]] = constant 1 + // CHECK: [[OFFSET:%.+]] = constant 0 : index + // CHECK: [[IDX0:%.+]] = constant 0 : index + // CHECK: [[ARG0_DIM0:%.+]] = memref.dim %arg0, [[IDX0]] + // CHECK: [[IDX1:%.+]] = constant 1 : index 
+ // CHECK: [[ARG0_DIM1:%.+]] = memref.dim %arg0, [[IDX1]] + // CHECK: [[ARG1_AXIS:%.+]] = memref.dim %arg0, [[AXIS]] + // CHECK: [[RESULT_AXIS:%.+]] = addi [[ARG0_DIM1]], [[ARG1_AXIS]] + // CHECK: [[INIT:%.+]] = linalg.init_tensor [5, 2] + // CHECK: [[ARG0_DIM1:%.+]] = memref.dim %arg0, [[AXIS]] + // CHECK: [[INSERT0:%.+]] = subtensor_insert %arg0 into [[INIT]]{{\[}}[[OFFSET]], [[OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]] + // CHECK: [[NEW_OFFSET:%.+]] = addi [[OFFSET]], [[ARG0_DIM1]] + // CHECK: [[ARG1_DIM1:%.+]] = memref.dim %arg0, [[AXIS]] + // CHECK: [[INSERT1:%.+]] = subtensor_insert %arg0 into [[INSERT0]]{{\[}}[[OFFSET]], [[NEW_OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG1_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]] + %1 = "tosa.concat"(%arg0, %arg0) { axis = 1 : i64} : (tensor<5x1xf32>, tensor<5x1xf32>) -> (tensor<5x2xf32>) + return +} -- GitLab From 0ca83730cc2bb19a871bb5fd37127a639f488924 Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Thu, 18 Mar 2021 15:34:37 -0700 Subject: [PATCH 0095/1000] Recommit "[AArch64][GlobalISel] Fold constants into G_GLOBAL_VALUE" This reverts commit 962b73dd0fc3906980e597f72a35eee7121cc5e2. This commit was reverted because of some internal SPEC test failures. It turns out that this wasn't actually relevant to anything in open source, so it's safe to recommit this. --- llvm/lib/Target/AArch64/AArch64Combine.td | 12 +- .../GISel/AArch64InstructionSelector.cpp | 8 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 10 +- .../GISel/AArch64PreLegalizerCombiner.cpp | 110 +++++++ .../fold-global-offsets-target-features.mir | 241 +++++++++++++++ .../GlobalISel/fold-global-offsets.mir | 284 ++++++++++++++++++ .../GlobalISel/legalize-global-pic.mir | 24 +- .../AArch64/GlobalISel/legalize-global.mir | 26 +- .../AArch64/GlobalISel/select-add-low.mir | 70 +++++ .../GlobalISel/select-gv-with-offset.mir | 38 +++ .../AArch64/GlobalISel/select-store.mir | 43 ++- .../CodeGen/AArch64/fold-global-offsets.ll | 139 +++++++-- 12 files changed, 955 insertions(+), 50 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/fold-global-offsets-target-features.mir create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/fold-global-offsets.mir create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/select-add-low.mir create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/select-gv-with-offset.mir diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 144e6b747f51..d5ea2d3eee98 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -24,10 +24,20 @@ def icmp_redundant_trunc : GICombineRule< [{ return matchICmpRedundantTrunc(*${root}, MRI, Helper.getKnownBits(), ${matchinfo}); }]), (apply [{ applyICmpRedundantTrunc(*${root}, MRI, B, Observer, ${matchinfo}); }])>; +// AArch64-specific offset folding for G_GLOBAL_VALUE. 
+def fold_global_offset_matchdata : GIDefMatchData<"std::pair">; +def fold_global_offset : GICombineRule< + (defs root:$root, fold_global_offset_matchdata:$matchinfo), + (match (wip_match_opcode G_GLOBAL_VALUE):$root, + [{ return matchFoldGlobalOffset(*${root}, MRI, ${matchinfo}); }]), + (apply [{ return applyFoldGlobalOffset(*${root}, MRI, B, Observer, ${matchinfo});}]) +>; + def AArch64PreLegalizerCombinerHelper: GICombinerHelper< "AArch64GenPreLegalizerCombinerHelper", [all_combines, fconstant_to_constant, - icmp_redundant_trunc]> { + icmp_redundant_trunc, + fold_global_offset]> { let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule"; let StateClass = "AArch64PreLegalizerCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 3b099d0b91bf..68c2e1e95048 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -5654,8 +5654,10 @@ AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, return None; // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG. - // TODO: Need to check GV's offset % size if doing offset folding into globals. - assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global"); + auto Offset = Adrp.getOperand(1).getOffset(); + if (Offset % Size != 0) + return None; + auto GV = Adrp.getOperand(1).getGlobal(); if (GV->isThreadLocal()) return None; @@ -5669,7 +5671,7 @@ AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef, Register AdrpReg = Adrp.getOperand(0).getReg(); return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); }, [=](MachineInstrBuilder &MIB) { - MIB.addGlobalAddress(GV, /* Offset */ 0, + MIB.addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); }}}; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index b01f1fee9ea3..83ffe09612bb 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -785,7 +785,8 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( // G_ADD_LOW instructions. // By splitting this here, we can optimize accesses in the small code model by // folding in the G_ADD_LOW into the load/store offset. - auto GV = MI.getOperand(1).getGlobal(); + auto &GlobalOp = MI.getOperand(1); + const auto* GV = GlobalOp.getGlobal(); if (GV->isThreadLocal()) return true; // Don't want to modify TLS vars. @@ -795,9 +796,10 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( if (OpFlags & AArch64II::MO_GOT) return true; + auto Offset = GlobalOp.getOffset(); Register DstReg = MI.getOperand(0).getReg(); auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {}) - .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE); + .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGE); // Set the regclass on the dest reg too. MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass); @@ -815,6 +817,8 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( // binary must also be loaded into address range [0, 2^48). Both of these // properties need to be ensured at runtime when using tagged addresses. 
if (OpFlags & AArch64II::MO_TAGGED) { + assert(!Offset && + "Should not have folded in an offset for a tagged global!"); ADRP = MIRBuilder.buildInstr(AArch64::MOVKXi, {LLT::pointer(0, 64)}, {ADRP}) .addGlobalAddress(GV, 0x100000000, AArch64II::MO_PREL | AArch64II::MO_G3) @@ -823,7 +827,7 @@ bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue( } MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP}) - .addGlobalAddress(GV, 0, + .addGlobalAddress(GV, Offset, OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp index 6e7fe7c98512..26029b4db11f 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp @@ -107,6 +107,116 @@ static bool applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, return true; } +/// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE. +/// +/// e.g. +/// +/// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst +static bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, + std::pair &MatchInfo) { + assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE); + MachineFunction &MF = *MI.getMF(); + auto &GlobalOp = MI.getOperand(1); + auto *GV = GlobalOp.getGlobal(); + + // Don't allow anything that could represent offsets etc. + if (MF.getSubtarget().ClassifyGlobalReference( + GV, MF.getTarget()) != AArch64II::MO_NO_FLAG) + return false; + + // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants: + // + // %g = G_GLOBAL_VALUE @x + // %ptr1 = G_PTR_ADD %g, cst1 + // %ptr2 = G_PTR_ADD %g, cst2 + // ... + // %ptrN = G_PTR_ADD %g, cstN + // + // Identify the *smallest* constant. We want to be able to form this: + // + // %offset_g = G_GLOBAL_VALUE @x + min_cst + // %g = G_PTR_ADD %offset_g, -min_cst + // %ptr1 = G_PTR_ADD %g, cst1 + // ... + Register Dst = MI.getOperand(0).getReg(); + uint64_t MinOffset = -1ull; + for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) { + if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD) + return false; + auto Cst = + getConstantVRegValWithLookThrough(UseInstr.getOperand(2).getReg(), MRI); + if (!Cst) + return false; + MinOffset = std::min(MinOffset, Cst->Value.getZExtValue()); + } + + // Require that the new offset is larger than the existing one to avoid + // infinite loops. + uint64_t CurrOffset = GlobalOp.getOffset(); + uint64_t NewOffset = MinOffset + CurrOffset; + if (NewOffset <= CurrOffset) + return false; + + // Check whether folding this offset is legal. It must not go out of bounds of + // the referenced object to avoid violating the code model, and must be + // smaller than 2^21 because this is the largest offset expressible in all + // object formats. + // + // This check also prevents us from folding negative offsets, which will end + // up being treated in the same way as large positive ones. They could also + // cause code model violations, and aren't really common enough to matter. 
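+  // For example, with a current offset of 0, a G_PTR_ADD of 1 << 20 can be
+  // folded, while 1 << 21 cannot; a negative constant such as -1 becomes a
+  // huge unsigned value here and is likewise rejected by the checks below.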
+ if (NewOffset >= (1 << 21)) + return false; + + Type *T = GV->getValueType(); + if (!T->isSized() || + NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(T)) + return false; + MatchInfo = std::make_pair(NewOffset, MinOffset); + return true; +} + +static bool applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B, + GISelChangeObserver &Observer, + std::pair &MatchInfo) { + // Change: + // + // %g = G_GLOBAL_VALUE @x + // %ptr1 = G_PTR_ADD %g, cst1 + // %ptr2 = G_PTR_ADD %g, cst2 + // ... + // %ptrN = G_PTR_ADD %g, cstN + // + // To: + // + // %offset_g = G_GLOBAL_VALUE @x + min_cst + // %g = G_PTR_ADD %offset_g, -min_cst + // %ptr1 = G_PTR_ADD %g, cst1 + // ... + // %ptrN = G_PTR_ADD %g, cstN + // + // Then, the original G_PTR_ADDs should be folded later on so that they look + // like this: + // + // %ptrN = G_PTR_ADD %offset_g, cstN - min_cst + uint64_t Offset, MinOffset; + std::tie(Offset, MinOffset) = MatchInfo; + B.setInstrAndDebugLoc(MI); + Observer.changingInstr(MI); + auto &GlobalOp = MI.getOperand(1); + auto *GV = GlobalOp.getGlobal(); + GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags()); + Register Dst = MI.getOperand(0).getReg(); + Register NewGVDst = MRI.cloneVirtualRegister(Dst); + MI.getOperand(0).setReg(NewGVDst); + Observer.changedInstr(MI); + B.buildPtrAdd( + Dst, NewGVDst, + B.buildConstant(LLT::scalar(64), -static_cast(MinOffset))); + return true; +} + class AArch64PreLegalizerCombinerHelperState { protected: CombinerHelper &Helper; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/fold-global-offsets-target-features.mir b/llvm/test/CodeGen/AArch64/GlobalISel/fold-global-offsets-target-features.mir new file mode 100644 index 000000000000..639c51d92d9c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/fold-global-offsets-target-features.mir @@ -0,0 +1,241 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=DEFAULT,CHECK +# RUN: llc -mtriple aarch64-apple-darwin -code-model=large -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=LARGE-MACHO,CHECK +# RUN: llc -mtriple aarch64-apple-darwin -code-model=small -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=SMALL-MACHO,CHECK +# RUN: llc -mtriple aarch64-linux-elf -code-model=large -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=LARGE-ELF,CHECK +# RUN: llc -mtriple aarch64-linux-elf -code-model=tiny -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=TINY,CHECK +# RUN: llc -mtriple aarch64-windows-coff -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=WINDOWS,CHECK + +# Each of these tests has a trivial pattern for folding a G_PTR_ADD into a +# G_GLOBAL_VALUE. +# +# Check that given different code models/target features, we do/don't fold. 
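+#
+# Each test below follows this shape (sketch; @some_global stands in for the
+# specific global under test):
+#
+#   %global:_(p0) = G_GLOBAL_VALUE @some_global
+#   %imm:_(s64) = G_CONSTANT i64 1
+#   %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64)
+#
+# and, when folding is allowed, becomes:
+#
+#   %global:_(p0) = G_GLOBAL_VALUE @some_global + 1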
+ +--- | + @external_linkage = external hidden global i32 + @common_linkage = common local_unnamed_addr global i32 0, align 4 + @internal_linkage = internal unnamed_addr global i32 0, align 4 + @extern_weak_linkage = extern_weak hidden global i32 + @dll_import = external dllimport global i32 + + define void @test_external_linkage() { ret void } + define void @test_internal_linkage() { ret void } + define void @test_common_linkage() { ret void } + define void @test_extern_weak_linkage() { ret void } + define void @never_fold_tagged_globals() #0 { ret void } + define void @test_dll_import() { ret void } + + attributes #0 = { "target-features"="+tagged-globals" } +... +--- +name: test_external_linkage +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + ; Large + Mach-O goes via GOT, so we can't fold. + + ; DEFAULT-LABEL: name: test_external_linkage + ; DEFAULT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_linkage + 1 + ; DEFAULT: $x0 = COPY [[GV]](p0) + ; DEFAULT: RET_ReallyLR implicit $x0 + ; LARGE-MACHO-LABEL: name: test_external_linkage + ; LARGE-MACHO: %global:_(p0) = G_GLOBAL_VALUE @external_linkage + ; LARGE-MACHO: %imm:_(s64) = G_CONSTANT i64 1 + ; LARGE-MACHO: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; LARGE-MACHO: $x0 = COPY %ptr_add(p0) + ; LARGE-MACHO: RET_ReallyLR implicit $x0 + ; SMALL-MACHO-LABEL: name: test_external_linkage + ; SMALL-MACHO: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_linkage + 1 + ; SMALL-MACHO: $x0 = COPY [[GV]](p0) + ; SMALL-MACHO: RET_ReallyLR implicit $x0 + ; LARGE-ELF-LABEL: name: test_external_linkage + ; LARGE-ELF: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_linkage + 1 + ; LARGE-ELF: $x0 = COPY [[GV]](p0) + ; LARGE-ELF: RET_ReallyLR implicit $x0 + ; TINY-LABEL: name: test_external_linkage + ; TINY: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_linkage + 1 + ; TINY: $x0 = COPY [[GV]](p0) + ; TINY: RET_ReallyLR implicit $x0 + ; WINDOWS-LABEL: name: test_external_linkage + ; WINDOWS: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_linkage + 1 + ; WINDOWS: $x0 = COPY [[GV]](p0) + ; WINDOWS: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @external_linkage + %imm:_(s64) = G_CONSTANT i64 1 + %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + $x0 = COPY %ptr_add(p0) + RET_ReallyLR implicit $x0 + +... +--- +name: test_internal_linkage +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + ; Large + Mach-O goes via GOT, so we can't fold. 
+ + ; DEFAULT-LABEL: name: test_internal_linkage + ; DEFAULT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @internal_linkage + 1 + ; DEFAULT: $x0 = COPY [[GV]](p0) + ; DEFAULT: RET_ReallyLR implicit $x0 + ; LARGE-MACHO-LABEL: name: test_internal_linkage + ; LARGE-MACHO: %global:_(p0) = G_GLOBAL_VALUE @internal_linkage + ; LARGE-MACHO: %imm:_(s64) = G_CONSTANT i64 1 + ; LARGE-MACHO: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; LARGE-MACHO: $x0 = COPY %ptr_add(p0) + ; LARGE-MACHO: RET_ReallyLR implicit $x0 + ; SMALL-MACHO-LABEL: name: test_internal_linkage + ; SMALL-MACHO: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @internal_linkage + 1 + ; SMALL-MACHO: $x0 = COPY [[GV]](p0) + ; SMALL-MACHO: RET_ReallyLR implicit $x0 + ; LARGE-ELF-LABEL: name: test_internal_linkage + ; LARGE-ELF: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @internal_linkage + 1 + ; LARGE-ELF: $x0 = COPY [[GV]](p0) + ; LARGE-ELF: RET_ReallyLR implicit $x0 + ; TINY-LABEL: name: test_internal_linkage + ; TINY: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @internal_linkage + 1 + ; TINY: $x0 = COPY [[GV]](p0) + ; TINY: RET_ReallyLR implicit $x0 + ; WINDOWS-LABEL: name: test_internal_linkage + ; WINDOWS: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @internal_linkage + 1 + ; WINDOWS: $x0 = COPY [[GV]](p0) + ; WINDOWS: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @internal_linkage + %imm:_(s64) = G_CONSTANT i64 1 + %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + $x0 = COPY %ptr_add(p0) + RET_ReallyLR implicit $x0 + +... +--- +name: test_common_linkage +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + ; DEFAULT-LABEL: name: test_common_linkage + ; DEFAULT: %global:_(p0) = G_GLOBAL_VALUE @common_linkage + ; DEFAULT: %imm:_(s64) = G_CONSTANT i64 1 + ; DEFAULT: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; DEFAULT: $x0 = COPY %ptr_add(p0) + ; DEFAULT: RET_ReallyLR implicit $x0 + ; LARGE-MACHO-LABEL: name: test_common_linkage + ; LARGE-MACHO: %global:_(p0) = G_GLOBAL_VALUE @common_linkage + ; LARGE-MACHO: %imm:_(s64) = G_CONSTANT i64 1 + ; LARGE-MACHO: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; LARGE-MACHO: $x0 = COPY %ptr_add(p0) + ; LARGE-MACHO: RET_ReallyLR implicit $x0 + ; SMALL-MACHO-LABEL: name: test_common_linkage + ; SMALL-MACHO: %global:_(p0) = G_GLOBAL_VALUE @common_linkage + ; SMALL-MACHO: %imm:_(s64) = G_CONSTANT i64 1 + ; SMALL-MACHO: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; SMALL-MACHO: $x0 = COPY %ptr_add(p0) + ; SMALL-MACHO: RET_ReallyLR implicit $x0 + ; LARGE-ELF-LABEL: name: test_common_linkage + ; LARGE-ELF: %global:_(p0) = G_GLOBAL_VALUE @common_linkage + ; LARGE-ELF: %imm:_(s64) = G_CONSTANT i64 1 + ; LARGE-ELF: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; LARGE-ELF: $x0 = COPY %ptr_add(p0) + ; LARGE-ELF: RET_ReallyLR implicit $x0 + ; TINY-LABEL: name: test_common_linkage + ; TINY: %global:_(p0) = G_GLOBAL_VALUE @common_linkage + ; TINY: %imm:_(s64) = G_CONSTANT i64 1 + ; TINY: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; TINY: $x0 = COPY %ptr_add(p0) + ; TINY: RET_ReallyLR implicit $x0 + ; WINDOWS-LABEL: name: test_common_linkage + ; WINDOWS: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @common_linkage + 1 + ; WINDOWS: $x0 = COPY [[GV]](p0) + ; WINDOWS: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @common_linkage + %imm:_(s64) = G_CONSTANT i64 1 + %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + $x0 = COPY %ptr_add(p0) + RET_ReallyLR implicit $x0 + +... 
+--- +name: test_extern_weak_linkage +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + ; DEFAULT-LABEL: name: test_extern_weak_linkage + ; DEFAULT: %global:_(p0) = G_GLOBAL_VALUE @extern_weak_linkage + ; DEFAULT: %imm:_(s64) = G_CONSTANT i64 1 + ; DEFAULT: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; DEFAULT: $x0 = COPY %ptr_add(p0) + ; DEFAULT: RET_ReallyLR implicit $x0 + ; LARGE-MACHO-LABEL: name: test_extern_weak_linkage + ; LARGE-MACHO: %global:_(p0) = G_GLOBAL_VALUE @extern_weak_linkage + ; LARGE-MACHO: %imm:_(s64) = G_CONSTANT i64 1 + ; LARGE-MACHO: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; LARGE-MACHO: $x0 = COPY %ptr_add(p0) + ; LARGE-MACHO: RET_ReallyLR implicit $x0 + ; SMALL-MACHO-LABEL: name: test_extern_weak_linkage + ; SMALL-MACHO: %global:_(p0) = G_GLOBAL_VALUE @extern_weak_linkage + ; SMALL-MACHO: %imm:_(s64) = G_CONSTANT i64 1 + ; SMALL-MACHO: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; SMALL-MACHO: $x0 = COPY %ptr_add(p0) + ; SMALL-MACHO: RET_ReallyLR implicit $x0 + ; LARGE-ELF-LABEL: name: test_extern_weak_linkage + ; LARGE-ELF: %global:_(p0) = G_GLOBAL_VALUE @extern_weak_linkage + ; LARGE-ELF: %imm:_(s64) = G_CONSTANT i64 1 + ; LARGE-ELF: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; LARGE-ELF: $x0 = COPY %ptr_add(p0) + ; LARGE-ELF: RET_ReallyLR implicit $x0 + ; TINY-LABEL: name: test_extern_weak_linkage + ; TINY: %global:_(p0) = G_GLOBAL_VALUE @extern_weak_linkage + ; TINY: %imm:_(s64) = G_CONSTANT i64 1 + ; TINY: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; TINY: $x0 = COPY %ptr_add(p0) + ; TINY: RET_ReallyLR implicit $x0 + ; WINDOWS-LABEL: name: test_extern_weak_linkage + ; WINDOWS: %global:_(p0) = G_GLOBAL_VALUE @extern_weak_linkage + ; WINDOWS: %imm:_(s64) = G_CONSTANT i64 1 + ; WINDOWS: %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + ; WINDOWS: $x0 = COPY %ptr_add(p0) + ; WINDOWS: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @extern_weak_linkage + %imm:_(s64) = G_CONSTANT i64 1 + %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + $x0 = COPY %ptr_add(p0) + RET_ReallyLR implicit $x0 + +... +--- +name: never_fold_tagged_globals +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + ; CHECK-LABEL: name: never_fold_tagged_globals + ; CHECK-NOT: %global:_(p0) = G_GLOBAL_VALUE @external_linkage + 1 + %global:_(p0) = G_GLOBAL_VALUE @external_linkage + %imm:_(s64) = G_CONSTANT i64 1 + %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + $x0 = COPY %ptr_add(p0) + RET_ReallyLR implicit $x0 + +... 
+--- +name: test_dll_import +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + ; CHECK-LABEL: name: test_dll_import + ; CHECK-NOT: %global:_(p0) = G_GLOBAL_VALUE @dll_import + 1 + %global:_(p0) = G_GLOBAL_VALUE @dll_import + %imm:_(s64) = G_CONSTANT i64 1 + %ptr_add:_(p0) = G_PTR_ADD %global, %imm(s64) + $x0 = COPY %ptr_add(p0) + RET_ReallyLR implicit $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/fold-global-offsets.mir b/llvm/test/CodeGen/AArch64/GlobalISel/fold-global-offsets.mir new file mode 100644 index 000000000000..514cef0e703d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/fold-global-offsets.mir @@ -0,0 +1,284 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64-apple-darwin -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- | + @g = external hidden global i32 + + %opaque = type opaque + @unsized = external hidden global %opaque + + define void @one_ptr_add() { ret void } + define void @add_to_offset() { ret void } + define void @two_ptr_adds_same_offset() { ret void } + define void @two_ptr_adds_different_offset() { ret void } + define void @ptr_add_chain() { ret void } + + define void @dont_fold_negative_offset() { ret void } + define void @dont_min_offset_less_than_curr_offset() { ret void } + define void @dont_fold_max_offset() { ret void } + define void @dont_fold_offset_larger_than_type_alloc() { ret void } + define void @dont_fold_unsized_type() { ret void } +... +--- +name: one_ptr_add +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + + ; We should fold the offset 1 into the G_GLOBAL_VALUE. + + ; CHECK-LABEL: name: one_ptr_add + ; CHECK: liveins: $x0 + ; CHECK: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @g + 1 + ; CHECK: $x0 = COPY [[GV]](p0) + ; CHECK: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @g + %offset:_(s64) = G_CONSTANT i64 1 + %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + $x0 = COPY %ptr_add + RET_ReallyLR implicit $x0 + +... +--- +name: add_to_offset +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + + ; We should fold the offset 1 into the G_GLOBAL_VALUE, resulting in a + ; final offset of 4. + + ; CHECK-LABEL: name: add_to_offset + ; CHECK: liveins: $x0 + ; CHECK: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @g + 4 + ; CHECK: $x0 = COPY [[GV]](p0) + ; CHECK: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @g + 3 + %offset:_(s64) = G_CONSTANT i64 1 + %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + $x0 = COPY %ptr_add + RET_ReallyLR implicit $x0 + +... +--- +name: two_ptr_adds_same_offset +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1 + + ; We're allowed to have more than one G_PTR_ADD use. We should fold 1 into + ; the G_GLOBAL_VALUE's offset. 
+ + ; CHECK-LABEL: name: two_ptr_adds_same_offset + ; CHECK: liveins: $x0, $x1 + ; CHECK: %val1:_(s64) = COPY $x0 + ; CHECK: %val2:_(s64) = COPY $x1 + ; CHECK: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @g + 1 + ; CHECK: G_STORE %val1(s64), [[GV]](p0) :: (store 8) + ; CHECK: G_STORE %val2(s64), [[GV]](p0) :: (store 8) + ; CHECK: RET_ReallyLR implicit $x0 + %val1:_(s64) = COPY $x0 + %val2:_(s64) = COPY $x1 + %global:_(p0) = G_GLOBAL_VALUE @g + %offset:_(s64) = G_CONSTANT i64 1 + %ptr_add1:_(p0) = G_PTR_ADD %global, %offset(s64) + %ptr_add2:_(p0) = G_PTR_ADD %global, %offset(s64) + G_STORE %val1:_(s64), %ptr_add1 :: (store 8) + G_STORE %val2:_(s64), %ptr_add2 :: (store 8) + RET_ReallyLR implicit $x0 + +... +--- +name: two_ptr_adds_different_offset +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0, $x1 + ; The lowest offset G_PTR_ADD (2) should be folded into the G_GLOBAL_VALUE. + ; + ; The other G_PTR_ADD should have its offset decremented by 2. + + ; CHECK-LABEL: name: two_ptr_adds_different_offset + ; CHECK: liveins: $x0, $x1 + ; CHECK: %val1:_(s64) = COPY $x0 + ; CHECK: %val2:_(s64) = COPY $x1 + ; CHECK: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @g + 2 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: %ptr_add2:_(p0) = G_PTR_ADD [[GV]], [[C]](s64) + ; CHECK: G_STORE %val1(s64), [[GV]](p0) :: (store 8) + ; CHECK: G_STORE %val2(s64), %ptr_add2(p0) :: (store 8) + ; CHECK: RET_ReallyLR implicit $x0 + %val1:_(s64) = COPY $x0 + %val2:_(s64) = COPY $x1 + %global:_(p0) = G_GLOBAL_VALUE @g + %offset1:_(s64) = G_CONSTANT i64 2 + %offset2:_(s64) = G_CONSTANT i64 10 + %ptr_add1:_(p0) = G_PTR_ADD %global, %offset1(s64) + %ptr_add2:_(p0) = G_PTR_ADD %global, %offset2(s64) + G_STORE %val1:_(s64), %ptr_add1 :: (store 8) + G_STORE %val2:_(s64), %ptr_add2 :: (store 8) + RET_ReallyLR implicit $x0 + +... +--- +name: ptr_add_chain +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; We should be able to fold all of the G_PTR_ADDs, except for the last one + ; into the G_GLOBAL_VALUE. + ; + ; (TypeAllocSize = 4, so the offset on the G_GLOBAL_VALUE can't go above + ; that.) + + ; CHECK-LABEL: name: ptr_add_chain + ; CHECK: liveins: $x0 + ; CHECK: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @g + 1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK: %dont_fold_me:_(p0) = G_PTR_ADD [[GV]], [[C]](s64) + ; CHECK: $x0 = COPY %dont_fold_me(p0) + ; CHECK: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @g + %offset:_(s64) = G_CONSTANT i64 1 + %ptr_add1:_(p0) = G_PTR_ADD %global, %offset(s64) + %ptr_add2:_(p0) = G_PTR_ADD %ptr_add1, %offset(s64) + %ptr_add3:_(p0) = G_PTR_ADD %ptr_add2, %offset(s64) + %ptr_add4:_(p0) = G_PTR_ADD %ptr_add3, %offset(s64) + %dont_fold_me:_(p0) = G_PTR_ADD %ptr_add4, %offset(s64) + $x0 = COPY %dont_fold_me + RET_ReallyLR implicit $x0 + +... +--- +name: dont_fold_negative_offset +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + + ; Do not add negative offsets to G_GLOBAL_VALUE. 
+ + ; CHECK-LABEL: name: dont_fold_negative_offset + ; CHECK: liveins: $x0 + ; CHECK: %global:_(p0) = G_GLOBAL_VALUE @g + ; CHECK: %offset:_(s64) = G_CONSTANT i64 -1 + ; CHECK: %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + ; CHECK: $x0 = COPY %ptr_add(p0) + ; CHECK: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @g + %offset:_(s64) = G_CONSTANT i64 -1 + %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + $x0 = COPY %ptr_add + RET_ReallyLR implicit $x0 + +... +--- +name: dont_min_offset_less_than_curr_offset +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + + ; Do not create smaller offsets. Ensures combine termination. + + ; CHECK-LABEL: name: dont_min_offset_less_than_curr_offset + ; CHECK: liveins: $x0 + ; CHECK: %global:_(p0) = G_GLOBAL_VALUE @g + 3 + ; CHECK: %offset:_(s64) = G_CONSTANT i64 -1 + ; CHECK: %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + ; CHECK: $x0 = COPY %ptr_add(p0) + ; CHECK: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @g + 3 + %offset:_(s64) = G_CONSTANT i64 -1 + %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + $x0 = COPY %ptr_add + RET_ReallyLR implicit $x0 + +... +--- +name: dont_fold_max_offset +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + + ; 1 << 21 is the largest offset expressible in all object formats. + ; Don't fold it. + + ; CHECK-LABEL: name: dont_fold_max_offset + ; CHECK: liveins: $x0 + ; CHECK: %global:_(p0) = G_GLOBAL_VALUE @g + ; CHECK: %offset:_(s64) = G_CONSTANT i64 4292870144 + ; CHECK: %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + ; CHECK: $x0 = COPY %ptr_add(p0) + ; CHECK: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @g + %offset:_(s64) = G_CONSTANT i64 4292870144 ; 1 << 21 + %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + $x0 = COPY %ptr_add + RET_ReallyLR implicit $x0 + +... +--- +name: dont_fold_offset_larger_than_type_alloc +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + + ; Type alloc size = 4, offset = 16. Don't fold. + + ; CHECK-LABEL: name: dont_fold_offset_larger_than_type_alloc + ; CHECK: %global:_(p0) = G_GLOBAL_VALUE @g + ; CHECK: %offset:_(s64) = G_CONSTANT i64 16 + ; CHECK: %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + ; CHECK: $x0 = COPY %ptr_add(p0) + ; CHECK: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @g + %offset:_(s64) = G_CONSTANT i64 16 + %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + $x0 = COPY %ptr_add(p0) + RET_ReallyLR implicit $x0 + +... +--- +name: dont_fold_unsized_type +alignment: 4 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + ; Check that we don't touch unsized globals. 
+ + ; CHECK-LABEL: name: dont_fold_unsized_type + ; CHECK: %global:_(p0) = G_GLOBAL_VALUE @unsized + ; CHECK: %offset:_(s64) = G_CONSTANT i64 16 + ; CHECK: %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + ; CHECK: $x0 = COPY %ptr_add(p0) + ; CHECK: RET_ReallyLR implicit $x0 + %global:_(p0) = G_GLOBAL_VALUE @unsized + %offset:_(s64) = G_CONSTANT i64 16 + %ptr_add:_(p0) = G_PTR_ADD %global, %offset(s64) + $x0 = COPY %ptr_add(p0) + RET_ReallyLR implicit $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-global-pic.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-global-pic.mir index 3fbd0125b31f..706bab2d0092 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-global-pic.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-global-pic.mir @@ -6,6 +6,7 @@ target triple = "aarch64--" @var = external global i8 define i8* @test_global() { ret i8* undef } + define i8* @test_global_with_offset() { ret i8* undef } ... --- name: test_global @@ -17,15 +18,6 @@ body: | ; We don't want to lower to G_ADD_LOW when we need a GOT access, or when the code ; model isn't 'Small'. - ; CHECK-LABEL: name: test_global - ; CHECK: [[ADRP:%[0-9]+]]:gpr64(p0) = ADRP target-flags(aarch64-page) @var - ; CHECK: [[ADD_LOW:%[0-9]+]]:_(p0) = G_ADD_LOW [[ADRP]](p0), target-flags(aarch64-pageoff, aarch64-nc) @var - ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[ADD_LOW]](p0) - ; CHECK: $x0 = COPY [[PTRTOINT]](s64) - ; CMLARGE-LABEL: name: test_global - ; CMLARGE: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var - ; CMLARGE: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[GV]](p0) - ; CMLARGE: $x0 = COPY [[PTRTOINT]](s64) ; PIC-LABEL: name: test_global ; PIC: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var ; PIC: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[GV]](p0) @@ -34,3 +26,17 @@ body: | %1:_(s64) = G_PTRTOINT %0 $x0 = COPY %1 ... +--- +name: test_global_with_offset +registers: + - { id: 0, class: _ } +body: | + bb.0: + ; PIC-LABEL: name: test_global_with_offset + ; PIC: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var + 1 + ; PIC: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[GV]](p0) + ; PIC: $x0 = COPY [[PTRTOINT]](s64) + %0(p0) = G_GLOBAL_VALUE @var + 1 + %1:_(s64) = G_PTRTOINT %0 + $x0 = COPY %1 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-global.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-global.mir index da84fb43ca93..4338db9df94a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-global.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-global.mir @@ -7,6 +7,7 @@ target triple = "aarch64--" @var = external dso_local global i8 define i8* @test_global() { ret i8* undef } + define i8* @test_global_with_offset() { ret i8* undef } ... --- name: test_global @@ -17,16 +18,11 @@ body: | ; We don't want to lower to G_ADD_LOW when we need a GOT access, or when the code ; model isn't 'Small'. 
- ; CHECK-LABEL: name: test_global ; CHECK: [[ADRP:%[0-9]+]]:gpr64(p0) = ADRP target-flags(aarch64-page) @var ; CHECK: [[ADD_LOW:%[0-9]+]]:_(p0) = G_ADD_LOW [[ADRP]](p0), target-flags(aarch64-pageoff, aarch64-nc) @var ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[ADD_LOW]](p0) ; CHECK: $x0 = COPY [[PTRTOINT]](s64) - ; PIC-LABEL: name: test_global - ; PIC: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var - ; PIC: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[GV]](p0) - ; PIC: $x0 = COPY [[PTRTOINT]](s64) ; CMLARGE-LABEL: name: test_global ; CMLARGE: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var ; CMLARGE: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[GV]](p0) @@ -35,3 +31,23 @@ body: | %1:_(s64) = G_PTRTOINT %0 $x0 = COPY %1 ... +--- +name: test_global_with_offset +body: | + bb.0: + ; When we legalize into ADRP + G_ADD_LOW, both should inherit the offset + ; from the original G_GLOBAL_VALUE. + ; + ; CHECK-LABEL: name: test_global_with_offset + ; CHECK: [[ADRP:%[0-9]+]]:gpr64(p0) = ADRP target-flags(aarch64-page) @var + 1 + ; CHECK: [[ADD_LOW:%[0-9]+]]:_(p0) = G_ADD_LOW [[ADRP]](p0), target-flags(aarch64-pageoff, aarch64-nc) @var + 1 + ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[ADD_LOW]](p0) + ; CHECK: $x0 = COPY [[PTRTOINT]](s64) + ; CMLARGE-LABEL: name: test_global_with_offset + ; CMLARGE: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @var + 1 + ; CMLARGE: [[PTRTOINT:%[0-9]+]]:_(s64) = G_PTRTOINT [[GV]](p0) + ; CMLARGE: $x0 = COPY [[PTRTOINT]](s64) + %0:_(p0) = G_GLOBAL_VALUE @var + 1 + %1:_(s64) = G_PTRTOINT %0 + $x0 = COPY %1 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-add-low.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-add-low.mir new file mode 100644 index 000000000000..2272aaf28673 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-add-low.mir @@ -0,0 +1,70 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s + +--- | + + @x = external hidden local_unnamed_addr global i32*, align 8 + + define void @select_add_low_without_offset() { ret void } + define void @select_add_low_with_offset() { ret void } + define void @select_add_low_without_adrp() { ret void } + +... +--- +name: select_add_low_without_offset +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: select_add_low_without_offset + ; CHECK: liveins: $x0 + ; CHECK: %add_low:gpr64 = MOVaddr target-flags(aarch64-page) @x, target-flags(aarch64-pageoff, aarch64-nc) @x + ; CHECK: $x0 = COPY %add_low + ; CHECK: RET_ReallyLR implicit $x0 + %copy:gpr(p0) = COPY $x0 + %adrp:gpr64(p0) = ADRP target-flags(aarch64-page) @x + %add_low:gpr(p0) = G_ADD_LOW %adrp(p0), target-flags(aarch64-pageoff, aarch64-nc) @x + $x0 = COPY %add_low + RET_ReallyLR implicit $x0 + +... +--- +name: select_add_low_with_offset +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: select_add_low_with_offset + ; CHECK: liveins: $x0 + ; CHECK: %add_low:gpr64 = MOVaddr target-flags(aarch64-page) @x + 1, target-flags(aarch64-pageoff, aarch64-nc) @x + 1 + ; CHECK: $x0 = COPY %add_low + ; CHECK: RET_ReallyLR implicit $x0 + %copy:gpr(p0) = COPY $x0 + %adrp:gpr64(p0) = ADRP target-flags(aarch64-page) @x + 1 + %add_low:gpr(p0) = G_ADD_LOW %adrp(p0), target-flags(aarch64-pageoff, aarch64-nc) @x + 1 + $x0 = COPY %add_low + RET_ReallyLR implicit $x0 + +... 
+--- +name: select_add_low_without_adrp +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: select_add_low_without_adrp + ; CHECK: liveins: $x0 + ; CHECK: %ptr:gpr64sp = COPY $x0 + ; CHECK: %add_low:gpr64sp = ADDXri %ptr, target-flags(aarch64-pageoff, aarch64-nc) @x, 0 + ; CHECK: $x0 = COPY %add_low + ; CHECK: RET_ReallyLR implicit $x0 + %ptr:gpr(p0) = COPY $x0 + %add_low:gpr(p0) = G_ADD_LOW %ptr(p0), target-flags(aarch64-pageoff, aarch64-nc) @x + $x0 = COPY %add_low + RET_ReallyLR implicit $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-gv-with-offset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-gv-with-offset.mir new file mode 100644 index 000000000000..7533731b2bd8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-gv-with-offset.mir @@ -0,0 +1,38 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=aarch64 -code-model=large -run-pass=instruction-select -verify-machineinstrs -O0 %s -o - | FileCheck %s --check-prefix=LARGE +# RUN: llc -mtriple=aarch64 -code-model=small -run-pass=instruction-select -verify-machineinstrs -O0 %s -o - | FileCheck %s --check-prefix=SMALL +# RUN: llc -mtriple=aarch64 -code-model=tiny -run-pass=instruction-select -verify-machineinstrs -O0 %s -o - | FileCheck %s --check-prefix=TINY + +--- | + @g = external hidden global i32 + define void @select_gv_with_offset() { ret void } +... +--- +name: select_gv_with_offset +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; LARGE-LABEL: name: select_gv_with_offset + ; LARGE: liveins: $x0 + ; LARGE: [[MOVZXi:%[0-9]+]]:gpr64 = MOVZXi target-flags(aarch64-g0, aarch64-nc) @g + 1, 0 + ; LARGE: [[MOVKXi:%[0-9]+]]:gpr64 = MOVKXi [[MOVZXi]], target-flags(aarch64-g1, aarch64-nc) @g + 1, 16 + ; LARGE: [[MOVKXi1:%[0-9]+]]:gpr64 = MOVKXi [[MOVKXi]], target-flags(aarch64-g2, aarch64-nc) @g + 1, 32 + ; LARGE: %g:gpr64 = MOVKXi [[MOVKXi1]], target-flags(aarch64-g3) @g + 1, 48 + ; LARGE: $x0 = COPY %g + ; LARGE: RET_ReallyLR implicit $x0 + ; SMALL-LABEL: name: select_gv_with_offset + ; SMALL: liveins: $x0 + ; SMALL: %g:gpr64 = MOVaddr target-flags(aarch64-page) @g + 1, target-flags(aarch64-pageoff, aarch64-nc) @g + 1 + ; SMALL: $x0 = COPY %g + ; SMALL: RET_ReallyLR implicit $x0 + ; TINY-LABEL: name: select_gv_with_offset + ; TINY: liveins: $x0 + ; TINY: %g:gpr64 = ADR @g + 1 + ; TINY: $x0 = COPY %g + ; TINY: RET_ReallyLR implicit $x0 + %g:gpr(p0) = G_GLOBAL_VALUE @g + 1 + $x0 = COPY %g(p0) + RET_ReallyLR implicit $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir index 62c28b906dea..5bbd2a73c14e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-store.mir @@ -41,7 +41,8 @@ @x = external hidden local_unnamed_addr global i32*, align 8 define void @store_adrp_add_low() { ret void } - + define void @store_adrp_add_low_foldable_offset() { ret void } + define void @store_adrp_add_low_unfoldable_offset() { ret void } ... --- @@ -622,3 +623,43 @@ body: | %adrp:gpr64(p0) = ADRP target-flags(aarch64-page) @x %add_low:gpr(p0) = G_ADD_LOW %adrp(p0), target-flags(aarch64-pageoff, aarch64-nc) @x G_STORE %copy(p0), %add_low(p0) :: (store 8 into @x) + +... 
+--- +name: store_adrp_add_low_foldable_offset +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: store_adrp_add_low_foldable_offset + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64all = COPY $x0 + ; CHECK: %adrp:gpr64common = ADRP target-flags(aarch64-page) @x + 8 + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY %copy + ; CHECK: STRXui [[COPY]], %adrp, target-flags(aarch64-pageoff, aarch64-nc) @x + 8 :: (store 8 into @x) + %copy:gpr(p0) = COPY $x0 + %adrp:gpr64(p0) = ADRP target-flags(aarch64-page) @x + 8 + %add_low:gpr(p0) = G_ADD_LOW %adrp(p0), target-flags(aarch64-pageoff, aarch64-nc) @x + 8 + G_STORE %copy(p0), %add_low(p0) :: (store 8 into @x) + +... +--- +name: store_adrp_add_low_unfoldable_offset +legalized: true +regBankSelected: true +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: store_adrp_add_low_unfoldable_offset + ; CHECK: liveins: $x0 + ; CHECK: %copy:gpr64all = COPY $x0 + ; CHECK: %add_low:gpr64common = MOVaddr target-flags(aarch64-page) @x + 3, target-flags(aarch64-pageoff, aarch64-nc) @x + 3 + ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY %copy + ; CHECK: STRXui [[COPY]], %add_low, 0 :: (store 8 into @x) + %copy:gpr(p0) = COPY $x0 + %adrp:gpr64(p0) = ADRP target-flags(aarch64-page) @x + 3 + %add_low:gpr(p0) = G_ADD_LOW %adrp(p0), target-flags(aarch64-pageoff, aarch64-nc) @x + 3 + G_STORE %copy(p0), %add_low(p0) :: (store 8 into @x) diff --git a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll index 40235791c524..24168f912175 100644 --- a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll +++ b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll @@ -1,69 +1,152 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s +; RUN: llc < %s -global-isel -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=GISEL @x1 = external hidden global [2 x i64] @x2 = external hidden global [16777216 x i64] @x3 = external hidden global { [9 x i8*], [8 x i8*] } define i64 @f1() { - ; CHECK: f1: - ; CHECK: adrp x8, x1+16 - ; CHECK: ldr x0, [x8, :lo12:x1+16] +; CHECK-LABEL: f1: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, x1+16 +; CHECK-NEXT: ldr x0, [x8, :lo12:x1+16] +; CHECK-NEXT: ret +; +; GISEL-LABEL: f1: +; GISEL: // %bb.0: +; GISEL-NEXT: adrp x8, x1+16 +; GISEL-NEXT: ldr x0, [x8, :lo12:x1+16] +; GISEL-NEXT: ret %l = load i64, i64* getelementptr ([2 x i64], [2 x i64]* @x1, i64 0, i64 2) ret i64 %l } define i64 @f2() { - ; CHECK: f2: - ; CHECK: adrp x8, x1 - ; CHECK: add x8, x8, :lo12:x1 - ; CHECK: ldr x0, [x8, #24] +; CHECK-LABEL: f2: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, x1 +; CHECK-NEXT: add x8, x8, :lo12:x1 +; CHECK-NEXT: ldr x0, [x8, #24] +; CHECK-NEXT: ret +; +; GISEL-LABEL: f2: +; GISEL: // %bb.0: +; GISEL-NEXT: adrp x8, x1 +; GISEL-NEXT: add x8, x8, :lo12:x1 +; GISEL-NEXT: ldr x0, [x8, #24] +; GISEL-NEXT: ret + %l = load i64, i64* getelementptr ([2 x i64], [2 x i64]* @x1, i64 0, i64 3) ret i64 %l } define i64 @f3() { - ; CHECK: f3: - ; CHECK: adrp x8, x1+1 - ; CHECK: add x8, x8, :lo12:x1+1 - ; CHECK: ldr x0, [x8] +; CHECK-LABEL: f3: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, x1+1 +; CHECK-NEXT: add x8, x8, :lo12:x1+1 +; CHECK-NEXT: ldr x0, [x8] +; CHECK-NEXT: ret +; +; GISEL-LABEL: f3: +; GISEL: // %bb.0: +; GISEL-NEXT: adrp x8, x1+1 +; GISEL-NEXT: add x8, x8, :lo12:x1+1 +; GISEL-NEXT: ldr x0, [x8] +; GISEL-NEXT: ret %l = load i64, i64* bitcast (i8* getelementptr (i8, 
                                     i8* bitcast ([2 x i64]* @x1 to i8*),
                                     i64 1) to i64*)
   ret i64 %l
 }
 
 define [2 x i64] @f4() {
-  ; CHECK: f4:
-  ; CHECK: adrp x8, x2+8
-  ; CHECK: add x8, x8, :lo12:x2+8
-  ; CHECK: ldp x0, x1, [x8]
+; FIXME: GlobalISel misses the opportunity to form a LDP here.
+;
+; CHECK-LABEL: f4:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x2+8
+; CHECK-NEXT: add x8, x8, :lo12:x2+8
+; CHECK-NEXT: ldp x0, x1, [x8]
+; CHECK-NEXT: ret
+;
+; GISEL-LABEL: f4:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x9, x2+8
+; GISEL-NEXT: adrp x8, x2+8
+; GISEL-NEXT: add x9, x9, :lo12:x2+8
+; GISEL-NEXT: ldr x0, [x8, :lo12:x2+8]
+; GISEL-NEXT: ldr x1, [x9, #8]
+; GISEL-NEXT: ret
   %l = load [2 x i64], [2 x i64]* bitcast (i8* getelementptr (i8, i8* bitcast ([16777216 x i64]* @x2 to i8*), i64 8) to [2 x i64]*)
   ret [2 x i64] %l
 }
 
 define i64 @f5() {
-  ; CHECK: f5:
-  ; CHECK: adrp x8, x2+2097144
-  ; CHECK: ldr x0, [x8, :lo12:x2+2097144]
-  ; CHECK: ret
+; CHECK-LABEL: f5:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x2+2097144
+; CHECK-NEXT: ldr x0, [x8, :lo12:x2+2097144]
+; CHECK-NEXT: ret
+;
+; GISEL-LABEL: f5:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x8, x2+2097144
+; GISEL-NEXT: ldr x0, [x8, :lo12:x2+2097144]
+; GISEL-NEXT: ret
   %l = load i64, i64* getelementptr ([16777216 x i64], [16777216 x i64]* @x2, i64 0, i64 262143)
   ret i64 %l
 }
 
 define i64 @f6() {
-  ; CHECK: f6:
-  ; CHECK: adrp x8, x2
-  ; CHECK: add x8, x8, :lo12:x2
-  ; CHECK: mov w9, #2097152
-  ; CHECK: ldr x0, [x8, x9]
-  ; CHECK: ret
+; CHECK-LABEL: f6:
+; CHECK: // %bb.0:
+; CHECK-NEXT: adrp x8, x2
+; CHECK-NEXT: add x8, x8, :lo12:x2
+; CHECK-NEXT: mov w9, #2097152
+; CHECK-NEXT: ldr x0, [x8, x9]
+; CHECK-NEXT: ret
+;
+; GISEL-LABEL: f6:
+; GISEL: // %bb.0:
+; GISEL-NEXT: adrp x9, x2
+; GISEL-NEXT: mov w8, #2097152
+; GISEL-NEXT: add x9, x9, :lo12:x2
+; GISEL-NEXT: ldr x0, [x9, x8]
+; GISEL-NEXT: ret
   %l = load i64, i64* getelementptr ([16777216 x i64], [16777216 x i64]* @x2, i64 0, i64 262144)
   ret i64 %l
 }
 
 define i32 @f7() {
+; FIXME: GlobalISel doesn't handle vectors well.
+;
+; CHECK-LABEL: f7:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: adrp x8, x3+108
+; CHECK-NEXT: ldr w0, [x8, :lo12:x3+108]
+; CHECK-NEXT: ret
+;
+; GISEL-LABEL: f7:
+; GISEL: // %bb.0: // %entry
+; GISEL-NEXT: adrp x8, x3+88
+; GISEL-NEXT: add x8, x8, :lo12:x3+88
+; GISEL-NEXT: mov v0.d[1], x8
+; GISEL-NEXT: mov w9, #64
+; GISEL-NEXT: mov d1, v0.d[1]
+; GISEL-NEXT: sub x8, x9, #64 // =64
+; GISEL-NEXT: fmov x11, d1
+; GISEL-NEXT: fmov x10, d0
+; GISEL-NEXT: lsl x12, x11, x8
+; GISEL-NEXT: cmp x9, #64 // =64
+; GISEL-NEXT: lsr x8, x11, x8
+; GISEL-NEXT: orr x11, x12, x10, lsr #0
+; GISEL-NEXT: csel x8, x11, x8, lo
+; GISEL-NEXT: cmp x9, #0 // =0
+; GISEL-NEXT: csel x8, x10, x8, eq
+; GISEL-NEXT: ldr w0, [x8, #20]
+; GISEL-NEXT: ret
+
 entry:
-  ; CHECK: f7
-  ; CHECK: adrp x8, x3+108
-  ; CHECK: ldr w0, [x8, :lo12:x3+108]
   %l = load i32, i32* getelementptr (i32, i32* inttoptr (i64 trunc (i128 lshr (i128 bitcast (<2 x i64> to i128), i128 64) to i64) to i32*), i64 5)
   ret i32 %l
 }
-- 
GitLab


From 286a9d467ea904490548a25e3c73ad0d50190b43 Mon Sep 17 00:00:00 2001
From: Rob Suderman
Date: Thu, 18 Mar 2021 16:14:05 -0700
Subject: [PATCH 0096/1000] [mlir][tosa] Add lowering for tosa.rescale to
 linalg.generic

This adds a tosa.apply_scale operation that handles the scaling operation
common to quantized operations. This scalar operation is lowered in
TosaToStandard.

We use a separate ApplyScale factorization as this is a replicable pattern
within TOSA.
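Roughly, the scalar semantics being factored out look like the following
sketch (simplified to omit double rounding; the function name and its
standalone C form are illustrative, not part of the patch):

    // Fixed-point rescale: widen to i64, add a rounding bias, then shift.
    int32_t apply_scale(int32_t value, int32_t multiplier, int8_t shift) {
      int64_t round = (int64_t)1 << (shift - 1);
      int64_t result = (int64_t)value * (int64_t)multiplier + round;
      return (int32_t)(result >> shift);
    }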
ApplyScale can be reused within pool/convolution/mul/matmul for their
quantized variants.

Tests are added to both tosa-to-standard and tosa-to-linalg-on-tensors
that verify each pass is correct.

Reviewed By: silvas

Differential Revision: https://reviews.llvm.org/D98753
---
 .../TosaToStandard/TosaToStandard.h           |   3 +
 mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td  |  24 ++
 .../mlir/Dialect/Tosa/IR/TosaTypesBase.td     |  15 ++
 .../Conversion/TosaToLinalg/TosaToLinalg.cpp  | 207 +++++++++++++++---
 .../TosaToLinalg/TosaToLinalgPass.cpp         |   7 +
 .../TosaToStandard/TosaToStandard.cpp         | 108 ++++++++-
 .../TosaToStandard/TosaToStandardPass.cpp     |   1 +
 .../TosaToLinalg/tosa-to-linalg.mlir          |  51 +++++
 .../TosaToStandard/tosa-to-standard.mlir      |  38 +++-
 9 files changed, 424 insertions(+), 30 deletions(-)

diff --git a/mlir/include/mlir/Conversion/TosaToStandard/TosaToStandard.h b/mlir/include/mlir/Conversion/TosaToStandard/TosaToStandard.h
index 82555003661e..5a63d787b38a 100644
--- a/mlir/include/mlir/Conversion/TosaToStandard/TosaToStandard.h
+++ b/mlir/include/mlir/Conversion/TosaToStandard/TosaToStandard.h
@@ -23,6 +23,9 @@ std::unique_ptr<Pass> createTosaToStandard();
 void populateTosaToStandardConversionPatterns(
     MLIRContext *context, OwningRewritePatternList *patterns);
 
+void populateTosaRescaleToStandardConversionPatterns(
+    MLIRContext *context, OwningRewritePatternList *patterns);
+
 /// Populates passes to convert from TOSA to Standard.
 void addTosaToStandardPasses(OpPassManager &pm);
 
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index c9790596ed88..576471562bf3 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -1494,6 +1494,30 @@ def Tosa_RescaleOp: Tosa_Op<"rescale", [NoSideEffect]> {
   );
 }
 
+def Tosa_ApplyScaleOp: Tosa_Op<"apply_scale", [NoSideEffect] # ElementwiseMappable.traits> {
+  let summary = "Rescale scalar operator for Tosa tensor operators";
+
+  let description = [{
+    Applies rescaling for fixed-point values. This behavior is replicated in
+    multiple quantized operations (mul, convolution, rescale, matmul, pooling).
+
+    The commonplace implementation is to use i64 operations to avoid integer
+    overflow; target-specific implementations can use native operations to
+    avoid wider-than-necessary types.
+  }];
+
+  let arguments = (ins
+    Tosa_Int32Like:$value,
+    Tosa_Int32Like:$multiplier,
+    Tosa_Int8Like:$shift,
+    BoolAttr:$double_round
+  );
+
+  let results = (outs
+    Tosa_Int32:$output
+  );
+}
+
 //===----------------------------------------------------------------------===//
 // TOSA Spec Section 2.13
 // Operator Class: Data Node Ops.
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index fe3eee7168c6..64314f06aac2 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -127,6 +127,21 @@ def Tosa_Tensor1Dto6D : TensorRankOf<[Tosa_AnyNumber], [1,2,3,4,5,6]>;
 def Tosa_TensorUpto4D : TensorRankOf<[Tosa_AnyNumber], [0,1,2,3,4]>;
 def Tosa_TensorUpto6D : TensorRankOf<[Tosa_AnyNumber], [0,1,2,3,4,5,6]>;
 
+//===----------------------------------------------------------------------===//
+// Generic scalar, vector, or tensor of a particular type.
+//===----------------------------------------------------------------------===//
+
+class Tosa_TypeLike<list<Type> types, string description = ""> : TypeConstraint<Or<[
+        AnyTypeOf<types>.predicate,
+        VectorOf<types>.predicate,
+        TensorOf<types>.predicate]>,
+    "signless-integer-32-like">;
+
+def Tosa_Int8Like : Tosa_TypeLike<[Tosa_Int8], "signless-integer-8-bit-like">;
+def Tosa_Int16Like : Tosa_TypeLike<[Tosa_Int16], "signless-integer-16-bit-like">;
+def Tosa_Int32Like : Tosa_TypeLike<[Tosa_Int32], "signless-integer-32-bit-like">;
+def Tosa_Int64Like : Tosa_TypeLike<[Tosa_Int64], "signless-integer-64-bit-like">;
+
 //===----------------------------------------------------------------------===//
 // Attribute predicates and classes.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index dd2725cbd0fa..5db47b423d89 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -32,22 +32,49 @@ static SmallVector<StringRef> getNParallelLoopsAttrs(unsigned nParallelLoops) {
 template <typename T>
 static mlir::ConstantOp
 createConstFromIntAttribute(Operation *op, std::string attrName,
-                            Type requiredAttrType, PatternRewriter &rewriter) {
+                            Type requiredAttrType, OpBuilder &rewriter) {
   auto castedN = static_cast<T>(
       op->getAttr(attrName).cast<IntegerAttr>().getValue().getSExtValue());
   return rewriter.create<mlir::ConstantOp>(
       op->getLoc(), IntegerAttr::get(requiredAttrType, castedN));
 }
 
+template <typename T>
+static void getValuesFromIntArrayAttribute(ArrayAttr attr,
+                                           SmallVector<T> &arrayValues) {
+  for (Attribute val : attr.getValue()) {
+    arrayValues.push_back(val.cast<IntegerAttr>().getValue().getSExtValue());
+  }
+}
+
+// Generates an affine map for parallel operations on a given type. This
+// performs implicit broadcasting across any dimension of size-1.
+static AffineMap createAffineMapForType(ShapedType type,
+                                        PatternRewriter &rewriter) {
+  unsigned rank = type.getRank();
+  auto shape = type.getShape();
+  SmallVector<AffineExpr> dimExprs;
+  dimExprs.reserve(rank);
+  for (unsigned i = 0; i < rank; ++i) {
+    // If the dimension is one we can broadcast the input with a constant
+    // affine expression.
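+    // For example, a rank-2 operand of type tensor<1x3xf32> receives the
+    // indexing map (d0, d1) -> (0, d1): the unit dimension is pinned to
+    // constant 0 rather than indexed.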
+ if (shape[i] == 1) + dimExprs.push_back(rewriter.getAffineConstantExpr(0)); + else + dimExprs.push_back(rewriter.getAffineDimExpr(i)); + } + return AffineMap::get(/*dimCount=*/rank, /*symbolCount=*/0, dimExprs, + rewriter.getContext()); +} + template -static mlir::SelectOp clampHelper(Operation *op, ValueRange args, - mlir::ConstantOp min, mlir::ConstantOp max, - P pred, PatternRewriter &rewriter) { - Location loc = op->getLoc(); - auto smallerThanMin = rewriter.create(loc, pred, args[0], min); +static mlir::SelectOp clampHelper(Location loc, Value arg, mlir::ConstantOp min, + mlir::ConstantOp max, P pred, + OpBuilder &rewriter) { + auto smallerThanMin = rewriter.create(loc, pred, arg, min); auto minOrArg = - rewriter.create(loc, smallerThanMin, min, args[0]); - auto largerThanMax = rewriter.create(loc, pred, max, args[0]); + rewriter.create(loc, smallerThanMin, min, arg); + auto largerThanMax = rewriter.create(loc, pred, max, arg); return rewriter.create(loc, largerThanMax, max, minOrArg); } @@ -211,7 +238,7 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, op->getAttr("min_fp")); auto max = rewriter.create(loc, elementTy, op->getAttr("max_fp")); - return clampHelper(op, args, min, max, CmpFPredicate::OLT, + return clampHelper(loc, args[0], min, max, CmpFPredicate::OLT, rewriter); } @@ -220,7 +247,7 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, rewriter); auto max = createConstFromIntAttribute(op, "max_int", elementTy, rewriter); - return clampHelper(op, args, min, max, CmpIPredicate::slt, + return clampHelper(loc, args[0], min, max, CmpIPredicate::slt, rewriter); } @@ -230,7 +257,7 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, rewriter.create(loc, FloatAttr::get(elementTy, 0)); auto n = rewriter.create(loc, elementTy, op->getAttr("max_fp")); - return clampHelper(op, args, zero, n, CmpFPredicate::OLT, + return clampHelper(loc, args[0], zero, n, CmpFPredicate::OLT, rewriter); } @@ -239,7 +266,7 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args, rewriter.create(loc, IntegerAttr::get(elementTy, 0)); auto n = createConstFromIntAttribute(op, "max_int", elementTy, rewriter); - return clampHelper(op, args, zero, n, CmpIPredicate::slt, + return clampHelper(loc, args[0], zero, n, CmpIPredicate::slt, rewriter); } @@ -290,21 +317,9 @@ elementwiseMatchAndRewriteHelper(Operation *operation, indexingMaps.reserve(operation->getNumOperands() + bodyResultTypes.size()); // Input indexing maps may be broadcasted. - for (Type types : operation->getOperandTypes()) { - auto shape = types.cast().getShape(); - SmallVector dimExprs; - dimExprs.reserve(nloops); - for (unsigned i = 0; i < nloops; ++i) { - // If the dimension is one we can broadcast the input with a constant - // affine expression. 
- if (shape[i] == 1) - dimExprs.push_back(rewriter.getAffineConstantExpr(0)); - else - dimExprs.push_back(rewriter.getAffineDimExpr(i)); - } - indexingMaps.push_back(AffineMap::get(/*dimCount=*/nloops, - /*symbolCount=*/0, dimExprs, - rewriter.getContext())); + for (Type type : operation->getOperandTypes()) { + indexingMaps.push_back( + createAffineMapForType(type.cast(), rewriter)); } indexingMaps.append(operation->getNumResults(), @@ -632,6 +647,142 @@ public: } }; +class RescaleOpConverter : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::RescaleOp op, + PatternRewriter &rewriter) const final { + auto loc = op.getLoc(); + auto input = op.input(); + auto inputTy = op.input().getType().cast(); + auto outputTy = op.output().getType().cast(); + unsigned rank = inputTy.getRank(); + + if (!outputTy.hasStaticShape()) + return rewriter.notifyMatchFailure( + op, "tosa to linalg conversion expects statically shaped tensors"); + + // The shift and multiplier values. + SmallVector multiplierValues; + getValuesFromIntArrayAttribute(op.multiplier(), multiplierValues); + + SmallVector shiftValues; + getValuesFromIntArrayAttribute(op.shift(), shiftValues); + + // Double round only occurs if shift is greater than 31, check that this + // is ever true. + bool doubleRound = + op.double_round() && + llvm::any_of(shiftValues, [](int32_t v) { return v > 31; }); + + // We need to broadcast along the last dimension, so make all dims 1. + SmallVector multiplierShape; + multiplierShape.resize(rank, 1); + + SmallVector shiftShape; + shiftShape.resize(rank, 1); + + // Set the channel dimension to match the number of shift/broadcast + // channels. + if (!multiplierShape.empty()) + multiplierShape.back() = multiplierValues.size(); + if (!shiftShape.empty()) + shiftShape.back() = shiftValues.size(); + + // Create the tensor types. + auto multiplierType = + RankedTensorType::get(multiplierShape, rewriter.getI32Type()); + auto shiftType = + RankedTensorType::get(shiftShape, rewriter.getIntegerType(8)); + + auto multiplierConst = rewriter.create( + loc, DenseIntElementsAttr::get(multiplierType, multiplierValues)); + + auto shiftConst = rewriter.create( + loc, DenseIntElementsAttr::get(shiftType, shiftValues)); + + // Construct the indexing maps needed for linalg.generic ops. + SmallVector bodyArgTypes = {getElementTypeOrSelf(inputTy), + rewriter.getI32Type(), + rewriter.getI32Type()}; + Value initTensor = rewriter.create( + loc, ArrayRef({}), outputTy.getShape(), + outputTy.getElementType()); + + SmallVector indexingMaps; + + // Indexing map for input values. + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(rank)); + + // Shift and multiplier will need to broadcast across their non channel + // values. + indexingMaps.push_back(createAffineMapForType(multiplierType, rewriter)); + indexingMaps.push_back(createAffineMapForType(shiftType, rewriter)); + + // Indexing maps for output values. + indexingMaps.push_back(rewriter.getMultiDimIdentityMap(rank)); + + auto linalgOp = rewriter.create( + loc, outputTy, ValueRange{input, multiplierConst, shiftConst}, + ValueRange{initTensor}, indexingMaps, getNParallelLoopsAttrs(rank), + [&](OpBuilder &nestedBuilder, Location nestedLoc, + ValueRange blockArgs) { + // For now we do all of our math in 64-bit. This is not optimal but + // should be correct for now, consider computing correct bit depth + // later. 
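+          // (Schematically, the per-element computation assembled below is:
+          //    out = clamp(apply_scale(ext(in) - input_zp) + output_zp),
+          //  truncated back down when the output type is narrower than i32.)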
+          auto inputZp = createConstFromIntAttribute(
+              op, "input_zp", nestedBuilder.getI32Type(), nestedBuilder);
+          auto outputZp = createConstFromIntAttribute(
+              op, "output_zp", nestedBuilder.getI32Type(), nestedBuilder);
+
+          Value value = blockArgs[0];
+          Value multiplier = blockArgs[1];
+          Value shift = blockArgs[2];
+
+          if (value.getType().getIntOrFloatBitWidth() < 32) {
+            value = nestedBuilder.create(
+                nestedLoc, nestedBuilder.getI32Type(), value);
+          }
+
+          value = nestedBuilder.create(nestedLoc, value, inputZp);
+
+          value = nestedBuilder.create(
+              loc, nestedBuilder.getI32Type(), value, multiplier, shift,
+              nestedBuilder.getBoolAttr(doubleRound));
+
+          // Move to the new zero-point.
+          value = nestedBuilder.create(nestedLoc, value, outputZp);
+
+          // Saturate to the output size.
+          IntegerType outIntType =
+              blockArgs.back().getType().cast();
+          unsigned outBitWidth = outIntType.getWidth();
+          auto intMin = nestedBuilder.create(
+              loc, nestedBuilder.getIntegerAttr(
+                       nestedBuilder.getI32Type(),
+                       APInt::getSignedMinValue(outBitWidth).getSExtValue()));
+          auto intMax = nestedBuilder.create(
+              loc, nestedBuilder.getIntegerAttr(
+                       nestedBuilder.getI32Type(),
+                       APInt::getSignedMaxValue(outBitWidth).getSExtValue()));
+
+          value = clampHelper(nestedLoc, value, intMin, intMax,
+                              CmpIPredicate::slt, nestedBuilder);
+
+          if (outIntType.getWidth() < 32) {
+            value =
+                nestedBuilder.create(nestedLoc, outIntType, value);
+          }
+
+          nestedBuilder.create(loc, value);
+        });
+
+    rewriter.replaceOp(op, linalgOp->getResults());
+    return success();
+  }
+};
+
 // At the codegen level any identity operations should be removed. Any cases
 // where identity is load-bearing (e.g. cross device computation) should be
 // handled before lowering to codegen.
@@ -729,5 +880,5 @@ public:
       IdentityNConverter, ReduceConverter,
       ReduceConverter, ReduceConverter,
       ReduceConverter, ConcatOpConversion,
-      ReshapeOpConverter, TransposeConverter>(context);
+      ReshapeOpConverter, TransposeConverter, RescaleOpConverter>(context);
 }
 
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp
index a1bd694f67af..e0f1369b43a5 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgPass.cpp
@@ -42,6 +42,13 @@ public:
     target.addLegalDialect();
     target.addIllegalDialect();
+
+    // Not every TOSA op can be legalized to linalg.
+    target.addLegalOp();
+    target.addLegalOp();
+    target.addLegalOp();
+    target.addLegalOp();
+
     target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
 
     FuncOp func = getFunction();
diff --git a/mlir/lib/Conversion/TosaToStandard/TosaToStandard.cpp b/mlir/lib/Conversion/TosaToStandard/TosaToStandard.cpp
index 6e5411dd5ecb..95f5c51ff1f0 100644
--- a/mlir/lib/Conversion/TosaToStandard/TosaToStandard.cpp
+++ b/mlir/lib/Conversion/TosaToStandard/TosaToStandard.cpp
@@ -46,7 +46,107 @@ public:
         sliceOp, sliceOp.getType(), input, ValueRange({}), ValueRange({}),
         ValueRange({}), sliceOp.start(), sliceOp.size(),
         rewriter.getI64ArrayAttr(strides));
+    return success();
+  }
+};
+
+// This converts the TOSA ApplyScale operator to a set of StandardOps ops,
+// using 64-bit operations to perform the necessary multiply, bias, and shift.
+// Multiple integer widths are used so that each operation runs at the
+// minimal necessary bit width.
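+//
+// As an illustrative example of the intended semantics (values chosen here
+// for exposition, not taken from the spec): for value = 7,
+// multiplier = 1 << 30, shift = 31 and double_round = false,
+//   round  = 1 << 30
+//   result = (7 * (1 << 30) + (1 << 30)) >> 31 = 4
+// i.e. a fixed-point multiply by multiplier / 2^shift, rounded to nearest.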
+class ApplyScaleOpConverter : public OpRewritePattern {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tosa::ApplyScaleOp op,
+                                PatternRewriter &rewriter) const final {
+    Location loc = op.getLoc();
+    Value value32 = op.value();
+    Value multiplier32 = op.multiplier();
+    Value shift8 = op.shift();
+    bool doubleRound = op.double_round();
+
+    Value one8 = rewriter.create(
+        loc, rewriter.getIntegerAttr(rewriter.getIntegerType(8), 1));
+    Value one32 = rewriter.create(
+        loc, rewriter.getIntegerAttr(rewriter.getI32Type(), 1));
+    Value one64 = rewriter.create(
+        loc, rewriter.getIntegerAttr(rewriter.getI64Type(), 1));
+
+    Value shiftSubOne8 = rewriter.create(loc, shift8, one8);
+
+    // The rounding value semantics below equate to the following code:
+    //    int64_t round = 1 << (shift - 1);
+    //    if (double_round) {
+    //      if (shift > 31 && value >= 0) round += 1<<30;
+    //      if (shift > 31 && value < 0) round -= 1<<30;
+    //    }
+    //
+    // Note that minimal bitwidth operators are used throughout the block.
+
+    Value shift32 = rewriter.create(
+        loc, rewriter.getI32Type(), shift8);
+
+    Value round64 = rewriter.create(
+        loc, one64,
+        rewriter.create(loc, rewriter.getI64Type(),
+                        shiftSubOne8));
+
+    // Double rounding performs an extra rounding step before the shift.
+    if (doubleRound) {
+      Value zero32 = rewriter.create(
+          loc, rewriter.getZeroAttr(rewriter.getI32Type()));
+      Value thirty32 = rewriter.create(
+          loc, rewriter.getIntegerAttr(rewriter.getI32Type(), 30));
+      Value shiftThirty32 =
+          rewriter.create(loc, one32, thirty32);
+      Value shiftThirty64 = rewriter.create(
+          loc, rewriter.getI64Type(), shiftThirty32);
+
+      // The round value needs to be added or subtracted depending on the
+      // sign of the input value.
+      Value roundAdd64 =
+          rewriter.create(loc, round64, shiftThirty64);
+      Value roundSub64 =
+          rewriter.create(loc, round64, shiftThirty64);
+
+      Value valueGreaterThanZero = rewriter.create(
+          loc, CmpIPredicate::sge, value32, zero32);
+
+      Value doubleRound64 = rewriter.create(
+          loc, valueGreaterThanZero, roundAdd64, roundSub64);
+
+      // We only perform double rounding if the shift value is greater than
+      // or equal to 32.
+      Value thirtyTwo32 = rewriter.create(
+          loc, rewriter.getIntegerAttr(rewriter.getI32Type(), 32));
+      Value shiftGreaterThanThirtyTwo = rewriter.create(
+          loc, CmpIPredicate::sge, shift32, thirtyTwo32);
+      round64 = rewriter.create(loc, shiftGreaterThanThirtyTwo,
+                                doubleRound64, round64);
+    }
+
+    // The computation below equates to the following pseudocode:
+    //    int64_t result = (int64_t)value * multiplier + round;
+    //    result = result >> shift;
+    //
+    // Note that multiply and shift need to be performed in i64 to preserve
+    // bits.
+
+    Value value64 =
+        rewriter.create(loc, rewriter.getI64Type(), value32);
+    Value multiplier64 = rewriter.create(
+        loc, rewriter.getI64Type(), multiplier32);
+    Value shift64 =
+        rewriter.create(loc, rewriter.getI64Type(), shift8);
+
+    // Multiply as a pair of i64 values to guarantee the end value fits.
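+    // The product of two i32 values is at most 2^62 in magnitude, so the
+    // multiply plus the rounding bias cannot overflow a signed i64 (assuming
+    // the shift stays within the range TOSA permits).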
+ Value result64 = rewriter.create(loc, value64, multiplier64); + result64 = rewriter.create(loc, result64, round64); + result64 = + rewriter.create(loc, result64, shift64); + + Value result32 = rewriter.create( + loc, rewriter.getI32Type(), result64); + + rewriter.replaceOp(op, result32); return success(); } }; @@ -55,5 +155,11 @@ public: void mlir::tosa::populateTosaToStandardConversionPatterns( MLIRContext *context, OwningRewritePatternList *patterns) { - patterns->insert(context); + patterns->insert( + context); +} + +void mlir::tosa::populateTosaRescaleToStandardConversionPatterns( + MLIRContext *context, OwningRewritePatternList *patterns) { + patterns->insert(context); } diff --git a/mlir/lib/Conversion/TosaToStandard/TosaToStandardPass.cpp b/mlir/lib/Conversion/TosaToStandard/TosaToStandardPass.cpp index 78a0e65da81b..14c800e2f70d 100644 --- a/mlir/lib/Conversion/TosaToStandard/TosaToStandardPass.cpp +++ b/mlir/lib/Conversion/TosaToStandard/TosaToStandardPass.cpp @@ -33,6 +33,7 @@ public: ConversionTarget target(getContext()); target.addIllegalOp(); target.addIllegalOp(); + target.addIllegalOp(); target.addLegalDialect(); auto *op = getOperation(); diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index 9b1f6054ee06..1714f140dbfc 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -473,3 +473,54 @@ func @concat(%arg0: tensor<5x1xf32>, %arg1: tensor<6x1xf32>) -> () { %1 = "tosa.concat"(%arg0, %arg0) { axis = 1 : i64} : (tensor<5x1xf32>, tensor<5x1xf32>) -> (tensor<5x2xf32>) return } + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0) -> (d0)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (0)> + +// CHECK-LABEL: @rescale +func @rescale(%arg0 : tensor<1xi8>) -> (tensor<1xi8>) { + // CHECK: [[C0:%.+]] = constant dense<19689> + // CHECK: [[C1:%.+]] = constant dense<15> + // CHECK: [[INIT:%.+]] = linalg.init_tensor [1] + // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP1]], #[[$MAP0]]], iterator_types = ["parallel"]} ins(%arg0, [[C0]], [[C1]] : tensor<1xi8>, tensor<1xi32>, tensor<1xi8>) outs([[INIT]] : tensor<1xi8>) + // CHECK: ^bb0([[IN:%.+]]: i8, [[MULTIPLIER:%.+]]: i32, [[SHIFT:%.+]]: i8, [[UNUSED:%.+]]: i8): + // CHECK: [[C243:%.+]] = constant 243 + // CHECK: [[C252:%.+]] = constant 252 + + // CHECK-DAG: [[IN32:%.+]] = sexti [[IN]] + // CHECK-DAG: [[IN_ZEROED:%.+]] = subi [[IN32]], [[C243]] + // CHECK-DAG: [[SCALED:%.+]] = "tosa.apply_scale"([[IN_ZEROED]], [[MULTIPLIER]], [[SHIFT]]) {double_round = false} + // CHECK-DAG: [[SCALED_ZEROED:%.+]] = addi [[SCALED]], [[C252]] + // CHECK-DAG: [[CMIN:%.+]] = constant -128 + // CHECK-DAG: [[CMAX:%.+]] = constant 127 + // CHECK-DAG: [[MINLT:%.+]] = cmpi slt, [[SCALED_ZEROED]], [[CMIN]] + // CHECK-DAG: [[MAXLT:%.+]] = cmpi slt, [[CMAX]], [[SCALED_ZEROED]] + // CHECK-DAG: [[LOWER:%.+]] = select [[MINLT]], [[CMIN]], [[SCALED_ZEROED]] + // CHECK-DAG: [[BOUNDED:%.+]] = select [[MAXLT]], [[CMAX]], [[LOWER]] + // CHECK-DAG: [[TRUNC:%.+]] = trunci [[BOUNDED]] + // CHECK-DAG: linalg.yield [[TRUNC]] + %0 = "tosa.rescale"(%arg0) {input_zp = 243 : i32, output_zp = 252 : i32, multiplier = [19689 : i32], shift = [15 : i32], scale32 = false, double_round = false, per_channel = false} : (tensor<1xi8>) -> (tensor<1xi8>) + + // CHECK: return [[GENERIC]] + return %0 : tensor<1xi8> +} + +// CHECK-LABEL: @rescaleDoubleRound +func @rescaleDoubleRound(%arg0 : 
tensor<1xi8>) -> (tensor<1xi8>) { + // CHECK: linalg.generic + // CHECK: "tosa.apply_scale" + // CHECK-SAME: {double_round = true} + %0 = "tosa.rescale"(%arg0) {input_zp = 243 : i32, output_zp = 252 : i32, multiplier = [19689 : i32], shift = [33 : i32], scale32 = true, double_round = true, per_channel = false} : (tensor<1xi8>) -> (tensor<1xi8>) + return %0 : tensor<1xi8> +} + +// CHECK-LABEL: @rescaleUnnecessaryDoubleRound +func @rescaleUnnecessaryDoubleRound(%arg0 : tensor<1xi8>) -> (tensor<1xi8>) { + // CHECK: linalg.generic + // CHECK: "tosa.apply_scale" + // CHECK-SAME: {double_round = false} + %0 = "tosa.rescale"(%arg0) {input_zp = 243 : i32, output_zp = 252 : i32, multiplier = [19689 : i32], shift = [15 : i32], scale32 = true, double_round = true, per_channel = false} : (tensor<1xi8>) -> (tensor<1xi8>) + return %0 : tensor<1xi8> +} diff --git a/mlir/test/Conversion/TosaToStandard/tosa-to-standard.mlir b/mlir/test/Conversion/TosaToStandard/tosa-to-standard.mlir index 94925aec15c7..2c80c31cf297 100644 --- a/mlir/test/Conversion/TosaToStandard/tosa-to-standard.mlir +++ b/mlir/test/Conversion/TosaToStandard/tosa-to-standard.mlir @@ -9,10 +9,46 @@ func @const_test() -> (tensor) { return %0 : tensor } -// ---- +// ----- func @slice(%arg0: tensor<6xf32>) ->() { // CHECK: [[SLICE:%.+]] = subtensor %arg0[2] [1] [1] %0 = "tosa.slice"(%arg0) {start = [2], size = [1]} : (tensor<6xf32>) -> (tensor<1xf32>) return } + +// ----- + +func @apply_scale_test(%arg0 : i32, %arg1 : i32, %arg2 : i8) -> (i32) { + // CHECK: [[C1_8:%.+]] = constant 1 : i8 + // CHECK: [[C1_32:%.+]] = constant 1 : i32 + // CHECK: [[C1_64:%.+]] = constant 1 : i64 + // CHECK: [[SHIFT_MINUS_ONE_8:%.+]] = subi %arg2, [[C1_8]] + + // CHECK: [[SHIFT_32:%.+]] = sexti %arg2 : i8 to i32 + // CHECK: [[SHIFT_MINUS_ONE_64:%.+]] = sexti [[SHIFT_MINUS_ONE_8]] : i8 to i64 + // CHECK: [[SHIFTED_64:%.+]] = shift_left [[C1_64]], [[SHIFT_MINUS_ONE_64]] + + // CHECK: [[C0_32:%.+]] = constant 0 : i32 + // CHECK: [[C30_32:%.+]] = constant 30 : i32 + // CHECK: [[SECOND_BIAS:%.+]] = shift_left [[C1_32]], [[C30_32]] + // CHECK: [[SECOND_BIAS_64:%.+]] = sexti [[SECOND_BIAS]] : i32 to i64 + // CHECK: [[POSITIVE_ROUND:%.+]] = addi [[SHIFTED_64]], [[SECOND_BIAS_64]] + // CHECK: [[NEGATIVE_ROUND:%.+]] = subi [[SHIFTED_64]], [[SECOND_BIAS_64]] + // CHECK: [[VALUE_NEGATIVE:%.+]] = cmpi sge, %arg0, [[C0_32]] : i32 + // CHECK: [[DOUBLE_ROUNDED:%.+]] = select [[VALUE_NEGATIVE]], [[POSITIVE_ROUND]], [[NEGATIVE_ROUND]] : i64 + // CHECK: [[C32_32:%.+]] = constant 32 : i32 + // CHECK: [[IS_32BIT_SHIFT:%.+]] = cmpi sge, [[SHIFT_32]], [[C32_32]] + // CHECK: [[ROUND:%.+]] = select [[IS_32BIT_SHIFT]], [[DOUBLE_ROUNDED]], [[SHIFTED_64]] + + // CHECK: [[VAL_64:%.+]] = sexti %arg0 : i32 to i64 + // CHECK: [[MULTIPLY_64:%.+]] = sexti %arg1 : i32 to i64 + // CHECK: [[SHIFT_64:%.+]] = sexti %arg2 : i8 to i64 + // CHECK: [[SCALED:%.+]] = muli [[VAL_64]], [[MULTIPLY_64]] + // CHECK: [[BIASED:%.+]] = addi [[SCALED]], [[ROUND]] + // CHECK: [[DOWNSHIFTED:%.+]] = shift_right_signed [[BIASED]], [[SHIFT_64]] + // CHECK: [[TRUNCATED:%.+]] = trunci [[DOWNSHIFTED]] + + %0 = "tosa.apply_scale"(%arg0, %arg1, %arg2) {double_round = true} : (i32, i32, i8) -> i32 + return %0 : i32 +} -- GitLab From d10f173f34baa139c4e85be96ff1750d6d689c8e Mon Sep 17 00:00:00 2001 From: George Balatsouras Date: Tue, 16 Mar 2021 12:05:15 -0700 Subject: [PATCH 0097/1000] [dfsan] Add -dfsan-fast-8-labels flag This is only adding support to the dfsan instrumentation pass but not to the runtime. 
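The new mode is selected the same way as fast16; for example (the input
file name here is a placeholder):

  opt < input.ll -dfsan -dfsan-fast-8-labels=true -S

Specifying both -dfsan-fast-8-labels and -dfsan-fast-16-labels is rejected
with a fatal error.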
Added more RUN lines for testing: for each instrumentation test that had a -dfsan-fast-16-labels invocation, a new invocation was added using fast8. Reviewed By: stephan.yichao.zhao Differential Revision: https://reviews.llvm.org/D98734 --- .../Instrumentation/DataFlowSanitizer.cpp | 200 +++++++++++---- .../DataFlowSanitizer/abilist.ll | 2 + .../DataFlowSanitizer/abilist_aggregate.ll | 1 + .../DataFlowSanitizer/array.ll | 227 +++++++++--------- .../DataFlowSanitizer/atomics.ll | 1 + .../DataFlowSanitizer/basic.ll | 1 + .../Instrumentation/DataFlowSanitizer/call.ll | 1 + .../DataFlowSanitizer/external_mask.ll | 1 + .../DataFlowSanitizer/fast16labels.ll | 25 +- .../Instrumentation/DataFlowSanitizer/phi.ll | 3 +- .../DataFlowSanitizer/select.ll | 2 + .../DataFlowSanitizer/shadow-args-zext.ll | 1 + .../DataFlowSanitizer/store.ll | 2 + .../DataFlowSanitizer/struct.ll | 163 +++++++------ .../DataFlowSanitizer/vector.ll | 1 + 15 files changed, 396 insertions(+), 235 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index a16ae68925a5..63b8db7916a0 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -16,9 +16,38 @@ /// issues within their own code. /// /// The analysis is based on automatic propagation of data flow labels (also -/// known as taint labels) through a program as it performs computation. Each -/// byte of application memory is backed by two bytes of shadow memory which -/// hold the label. On Linux/x86_64, memory is laid out as follows: +/// known as taint labels) through a program as it performs computation. +/// +/// There are two possible memory layouts. In the first one, each byte of +/// application memory is backed by a shadow memory byte. The shadow byte can +/// represent up to 8 labels. To enable this you must specify the +/// -dfsan-fast-8-labels flag. On Linux/x86_64, memory is then laid out as +/// follows: +/// +/// +--------------------+ 0x800000000000 (top of memory) +/// | application memory | +/// +--------------------+ 0x700000008000 (kAppAddr) +/// | | +/// | unused | +/// | | +/// +--------------------+ 0x300200000000 (kUnusedAddr) +/// | union table | +/// +--------------------+ 0x300000000000 (kUnionTableAddr) +/// | origin | +/// +--------------------+ 0x200000008000 (kOriginAddr) +/// | shadow memory | +/// +--------------------+ 0x100000008000 (kShadowAddr) +/// | unused | +/// +--------------------+ 0x000000010000 +/// | reserved by kernel | +/// +--------------------+ 0x000000000000 +/// +/// +/// In the second memory layout, each byte of application memory is backed by +/// two bytes of shadow memory which hold the label. That means we can represent +/// either 16 labels (with -dfsan-fast-16-labels flag) or 2^16 labels (on the +/// default legacy mode) per byte. On Linux/x86_64, memory is then laid out as +/// follows: /// /// +--------------------+ 0x800000000000 (top of memory) /// | application memory | @@ -36,6 +65,7 @@ /// | reserved by kernel | /// +--------------------+ 0x000000000000 /// +/// /// To derive a shadow memory address from an application memory address, /// bits 44-46 are cleared to bring the address into the range /// [0x000000008000,0x100000000000). 
Then the address is shifted left by 1 to @@ -200,6 +230,14 @@ static cl::opt ClFast16Labels( "labels to 16."), cl::Hidden, cl::init(false)); +// Use a distinct bit for each base label, enabling faster unions with less +// instrumentation. Limits the max number of base labels to 8. +static cl::opt ClFast8Labels( + "dfsan-fast-8-labels", + cl::desc("Use more efficient instrumentation, limiting the number of " + "labels to 8."), + cl::Hidden, cl::init(false)); + // Controls whether the pass tracks the control flow of select instructions. static cl::opt ClTrackSelectControlFlow( "dfsan-track-select-control-flow", @@ -341,8 +379,6 @@ class DataFlowSanitizer { friend class DFSanVisitor; enum { - ShadowWidthBits = 16, - ShadowWidthBytes = ShadowWidthBits / 8, OriginWidthBits = 32, OriginWidthBytes = OriginWidthBits / 8 }; @@ -383,6 +419,9 @@ class DataFlowSanitizer { WK_Custom }; + unsigned ShadowWidthBits; + unsigned ShadowWidthBytes; + Module *Mod; LLVMContext *Ctx; Type *Int8Ptr; @@ -419,7 +458,7 @@ class DataFlowSanitizer { FunctionCallee DFSanUnionFn; FunctionCallee DFSanCheckedUnionFn; FunctionCallee DFSanUnionLoadFn; - FunctionCallee DFSanUnionLoadFast16LabelsFn; + FunctionCallee DFSanUnionLoadFastLabelsFn; FunctionCallee DFSanLoadLabelAndOriginFn; FunctionCallee DFSanUnimplementedFn; FunctionCallee DFSanSetLabelFn; @@ -442,6 +481,7 @@ class DataFlowSanitizer { Value *getShadowOffset(Value *Addr, IRBuilder<> &IRB); Value *getShadowAddress(Value *Addr, Instruction *Pos); + Value *getShadowAddress(Value *Addr, Instruction *Pos, Value *ShadowOffset); std::pair getShadowOriginAddress(Value *Addr, Align InstAlignment, Instruction *Pos); bool isInstrumented(const Function *F); @@ -462,6 +502,9 @@ class DataFlowSanitizer { bool init(Module &M); + /// Returns whether fast8 or fast16 mode has been specified. + bool hasFastLabelsEnabled(); + /// Returns whether the pass tracks origins. Support only fast16 mode in TLS /// ABI mode. bool shouldTrackOrigins(); @@ -733,6 +776,14 @@ private: DataFlowSanitizer::DataFlowSanitizer( const std::vector &ABIListFiles) { + if (ClFast8Labels && ClFast16Labels) { + report_fatal_error( + "cannot set both -dfsan-fast-8-labels and -dfsan-fast-16-labels"); + } + + ShadowWidthBits = ClFast8Labels ? 8 : 16; + ShadowWidthBytes = ShadowWidthBits / 8; + std::vector AllABIListFiles(std::move(ABIListFiles)); llvm::append_range(AllABIListFiles, ClABIListFiles); // FIXME: should we propagate vfs::FileSystem to this constructor? @@ -827,6 +878,11 @@ bool DataFlowSanitizer::isZeroShadow(Value *V) { return isa(V); } +bool DataFlowSanitizer::hasFastLabelsEnabled() { + static const bool HasFastLabelsEnabled = ClFast8Labels || ClFast16Labels; + return HasFastLabelsEnabled; +} + bool DataFlowSanitizer::shouldTrackOrigins() { static const bool ShouldTrackOrigins = ClTrackOrigins && getInstrumentedABI() == DataFlowSanitizer::IA_TLS && @@ -835,7 +891,8 @@ bool DataFlowSanitizer::shouldTrackOrigins() { } bool DataFlowSanitizer::shouldTrackFieldsAndIndices() { - return getInstrumentedABI() == DataFlowSanitizer::IA_TLS && ClFast16Labels; + return getInstrumentedABI() == DataFlowSanitizer::IA_TLS && + hasFastLabelsEnabled(); } Constant *DataFlowSanitizer::getZeroShadow(Type *OrigTy) { @@ -1000,11 +1057,15 @@ bool DataFlowSanitizer::init(Module &M) { switch (TargetTriple.getArch()) { case Triple::x86_64: - ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); + ShadowPtrMask = ClFast8Labels + ? 
ConstantInt::getSigned(IntptrTy, ~0x600000000000LL) + : ConstantInt::getSigned(IntptrTy, ~0x700000000000LL); break; case Triple::mips64: case Triple::mips64el: - ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); + ShadowPtrMask = ClFast8Labels + ? ConstantInt::getSigned(IntptrTy, ~0xE000000000LL) + : ConstantInt::getSigned(IntptrTy, ~0xF000000000LL); break; case Triple::aarch64: case Triple::aarch64_be: @@ -1238,7 +1299,7 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { Attribute::ReadOnly); AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex, Attribute::ZExt); - DFSanUnionLoadFast16LabelsFn = Mod->getOrInsertFunction( + DFSanUnionLoadFastLabelsFn = Mod->getOrInsertFunction( "__dfsan_union_load_fast16labels", DFSanUnionLoadFnTy, AL); } { @@ -1290,7 +1351,7 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) { DFSanRuntimeFunctions.insert( DFSanUnionLoadFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( - DFSanUnionLoadFast16LabelsFn.getCallee()->stripPointerCasts()); + DFSanUnionLoadFastLabelsFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( DFSanLoadLabelAndOriginFn.getCallee()->stripPointerCasts()); DFSanRuntimeFunctions.insert( @@ -1757,8 +1818,7 @@ DataFlowSanitizer::getShadowOriginAddress(Value *Addr, Align InstAlignment, // Returns ((Addr & shadow_mask) + origin_base) & ~4UL IRBuilder<> IRB(Pos); Value *ShadowOffset = getShadowOffset(Addr, IRB); - Value *ShadowPtr = IRB.CreateIntToPtr( - IRB.CreateMul(ShadowOffset, ShadowPtrMul), PrimitiveShadowPtrTy); + Value *ShadowPtr = getShadowAddress(Addr, Pos, ShadowOffset); Value *OriginPtr = nullptr; if (shouldTrackOrigins()) { Value *OriginLong = IRB.CreateAdd(ShadowOffset, OriginBase); @@ -1774,12 +1834,21 @@ DataFlowSanitizer::getShadowOriginAddress(Value *Addr, Align InstAlignment, return {ShadowPtr, OriginPtr}; } +Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos, + Value *ShadowOffset) { + IRBuilder<> IRB(Pos); + + if (!ShadowPtrMul->isOne()) + ShadowOffset = IRB.CreateMul(ShadowOffset, ShadowPtrMul); + + return IRB.CreateIntToPtr(ShadowOffset, PrimitiveShadowPtrTy); +} + Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) { // Returns (Addr & shadow_mask) x 2 IRBuilder<> IRB(Pos); Value *ShadowOffset = getShadowOffset(Addr, IRB); - return IRB.CreateIntToPtr(IRB.CreateMul(ShadowOffset, ShadowPtrMul), - PrimitiveShadowPtrTy); + return getShadowAddress(Addr, Pos, ShadowOffset); } Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2, @@ -1829,7 +1898,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) { Value *PV2 = collapseToPrimitiveShadow(V2, Pos); IRBuilder<> IRB(Pos); - if (ClFast16Labels) { + if (DFS.hasFastLabelsEnabled()) { CCS.Block = Pos->getParent(); CCS.Shadow = IRB.CreateOr(PV1, PV2); } else if (AvoidNewBlocks) { @@ -1978,27 +2047,53 @@ bool DFSanFunction::useCallbackLoadLabelAndOrigin(uint64_t Size, std::pair DFSanFunction::loadFast16ShadowFast( Value *ShadowAddr, Value *OriginAddr, uint64_t Size, Align ShadowAlign, Align OriginAlign, Value *FirstOrigin, Instruction *Pos) { - // First OR all the WideShadows, then OR individual shadows within the - // combined WideShadow. This is fewer instructions than ORing shadows - // individually. 
const bool ShouldTrackOrigins = DFS.shouldTrackOrigins(); + const uint64_t ShadowSize = Size * DFS.ShadowWidthBytes; + + assert(Size >= 4 && "Not large enough load size for fast path!"); + + // Used for origin tracking. std::vector Shadows; std::vector Origins; + + // Load instructions in LLVM can have arbitrary byte sizes (e.g., 3, 12, 20) + // but this function is only used in a subset of cases that make it possible + // to optimize the instrumentation. + // + // Specifically, when the shadow size in bytes (i.e., loaded bytes x shadow + // per byte) is either: + // - a multiple of 8 (common) + // - equal to 4 (only for load32 in fast-8 mode) + // + // For the second case, we can fit the wide shadow in a 32-bit integer. In all + // other cases, we use a 64-bit integer to hold the wide shadow. + Type *WideShadowTy = + ShadowSize == 4 ? Type::getInt32Ty(*DFS.Ctx) : Type::getInt64Ty(*DFS.Ctx); + IRBuilder<> IRB(Pos); - Value *WideAddr = - IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx)); + Value *WideAddr = IRB.CreateBitCast(ShadowAddr, WideShadowTy->getPointerTo()); Value *CombinedWideShadow = - IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign); + IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign); + if (ShouldTrackOrigins) { Shadows.push_back(CombinedWideShadow); Origins.push_back(FirstOrigin); } - for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size; - Ofs += 64 / DFS.ShadowWidthBits) { - WideAddr = IRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr, + + // First OR all the WideShadows (i.e., 64bit or 32bit shadow chunks) linearly; + // then OR individual shadows within the combined WideShadow by binary ORing. + // This is fewer instructions than ORing shadows individually, since it + // needs logN shift/or instructions (N being the bytes of the combined wide + // shadow). + unsigned WideShadowBitWidth = WideShadowTy->getIntegerBitWidth(); + const uint64_t BytesPerWideShadow = WideShadowBitWidth / DFS.ShadowWidthBits; + + for (uint64_t ByteOfs = BytesPerWideShadow; ByteOfs < Size; + ByteOfs += BytesPerWideShadow) { + WideAddr = IRB.CreateGEP(WideShadowTy, WideAddr, ConstantInt::get(DFS.IntptrTy, 1)); Value *NextWideShadow = - IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign); + IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign); CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, NextWideShadow); if (ShouldTrackOrigins) { Shadows.push_back(NextWideShadow); @@ -2008,7 +2103,8 @@ std::pair DFSanFunction::loadFast16ShadowFast( IRB.CreateAlignedLoad(DFS.OriginTy, OriginAddr, OriginAlign)); } } - for (unsigned Width = 32; Width >= DFS.ShadowWidthBits; Width >>= 1) { + for (unsigned Width = WideShadowBitWidth / 2; Width >= DFS.ShadowWidthBits; + Width >>= 1) { Value *ShrShadow = IRB.CreateLShr(CombinedWideShadow, Width); CombinedWideShadow = IRB.CreateOr(CombinedWideShadow, ShrShadow); } @@ -2023,24 +2119,33 @@ Value *DFSanFunction::loadLegacyShadowFast(Value *ShadowAddr, uint64_t Size, Align ShadowAlign, Instruction *Pos) { // Fast path for the common case where each byte has identical shadow: load - // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any - // shadow is non-equal. + // shadow 64 (or 32) bits at a time, fall out to a __dfsan_union_load call if + // any shadow is non-equal. 
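+  // For example, with 16-bit shadows in a 64-bit word laid out as
+  // s3|s2|s1|s0, (word rotl 16) == word holds exactly when
+  // s0 == s1 == s2 == s3.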
   BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
   IRBuilder<> FallbackIRB(FallbackBB);
   CallInst *FallbackCall = FallbackIRB.CreateCall(
       DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
   FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
 
+  const uint64_t ShadowSize = Size * DFS.ShadowWidthBytes;
+  assert(Size >= 4 && "Not large enough load size for fast path!");
+
+  // Same as in loadFast16ShadowFast. In the case of load32, we can fit the
+  // wide shadow in a 32-bit integer instead.
+  Type *WideShadowTy =
+      ShadowSize == 4 ? Type::getInt32Ty(*DFS.Ctx) : Type::getInt64Ty(*DFS.Ctx);
+
   // Compare each of the shadows stored in the loaded 64 bits to each other,
   // by computing (WideShadow rotl ShadowWidthBits) == WideShadow.
   IRBuilder<> IRB(Pos);
-  Value *WideAddr =
-      IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+  unsigned WideShadowBitWidth = WideShadowTy->getIntegerBitWidth();
+  Value *WideAddr = IRB.CreateBitCast(ShadowAddr, WideShadowTy->getPointerTo());
   Value *WideShadow =
-      IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+      IRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
   Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.PrimitiveShadowTy);
   Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidthBits);
-  Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidthBits);
+  Value *ShrShadow =
+      IRB.CreateLShr(WideShadow, WideShadowBitWidth - DFS.ShadowWidthBits);
   Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
   Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
 
@@ -2063,15 +2168,17 @@ Value *DFSanFunction::loadLegacyShadowFast(Value *ShadowAddr, uint64_t Size,
   ReplaceInstWithInst(Head->getTerminator(), LastBr);
   DT.addNewBlock(FallbackBB, Head);
 
-  for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
-       Ofs += 64 / DFS.ShadowWidthBits) {
+  const uint64_t BytesPerWideShadow = WideShadowBitWidth / DFS.ShadowWidthBits;
+
+  for (uint64_t ByteOfs = BytesPerWideShadow; ByteOfs < Size;
+       ByteOfs += BytesPerWideShadow) {
     BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
     DT.addNewBlock(NextBB, LastBr->getParent());
     IRBuilder<> NextIRB(NextBB);
-    WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
+    WideAddr = NextIRB.CreateGEP(WideShadowTy, WideAddr,
                                  ConstantInt::get(DFS.IntptrTy, 1));
     Value *NextWideShadow =
-        NextIRB.CreateAlignedLoad(NextIRB.getInt64Ty(), WideAddr, ShadowAlign);
+        NextIRB.CreateAlignedLoad(WideShadowTy, WideAddr, ShadowAlign);
     ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
     LastBr->setSuccessor(0, NextBB);
     LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
@@ -2158,6 +2265,8 @@ std::pair DFSanFunction::loadShadowOrigin(Value *Addr,
     Origin = IRB.CreateAlignedLoad(DFS.OriginTy, OriginAddr, OriginAlign);
   }
 
+  // When the byte size is small enough, we can load the shadow directly with
+  // just a few instructions.
switch (Size) { case 1: { LoadInst *LI = new LoadInst(DFS.PrimitiveShadowTy, ShadowAddr, "", Pos); @@ -2175,17 +2284,21 @@ std::pair DFSanFunction::loadShadowOrigin(Value *Addr, return {combineShadows(Load, Load1, Pos), Origin}; } } + uint64_t ShadowSize = Size * DFS.ShadowWidthBytes; + bool HasSizeForFastPath = ShadowSize % 8 == 0 || ShadowSize == 4; + bool HasFastLabelsEnabled = DFS.hasFastLabelsEnabled(); - if (ClFast16Labels && Size % (64 / DFS.ShadowWidthBits) == 0) + if (HasFastLabelsEnabled && HasSizeForFastPath) return loadFast16ShadowFast(ShadowAddr, OriginAddr, Size, ShadowAlign, OriginAlign, Origin, Pos); - if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0) + if (!AvoidNewBlocks && HasSizeForFastPath) return {loadLegacyShadowFast(ShadowAddr, Size, ShadowAlign, Pos), Origin}; IRBuilder<> IRB(Pos); - FunctionCallee &UnionLoadFn = - ClFast16Labels ? DFS.DFSanUnionLoadFast16LabelsFn : DFS.DFSanUnionLoadFn; + FunctionCallee &UnionLoadFn = HasFastLabelsEnabled + ? DFS.DFSanUnionLoadFastLabelsFn + : DFS.DFSanUnionLoadFn; CallInst *FallbackCall = IRB.CreateCall( UnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)}); FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt); @@ -2406,7 +2519,10 @@ void DFSanFunction::storePrimitiveShadowOrigin(Value *Addr, uint64_t Size, std::tie(ShadowAddr, OriginAddr) = DFS.getShadowOriginAddress(Addr, InstAlignment, Pos); - const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits; + const unsigned ShadowVecSize = 8; + assert(ShadowVecSize * DFS.ShadowWidthBits <= 128 && + "Shadow vector is too large!"); + uint64_t Offset = 0; uint64_t LeftSize = Size; if (LeftSize >= ShadowVecSize) { diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll b/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll index 3750ce346586..11a09af8dc00 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/abilist.ll @@ -1,4 +1,6 @@ ; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s +; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll b/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll index 1bcc78517ca9..6d59e96a9401 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/abilist_aggregate.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI ; RUN: opt < %s -dfsan -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,LEGACY ; RUN: opt < %s -dfsan -dfsan-args-abi -dfsan-abilist=%S/Inputs/abilist.txt -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" 
diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/array.ll b/llvm/test/Instrumentation/DataFlowSanitizer/array.ll index 939a7f81ba96..fc88061480d5 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/array.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/array.ll @@ -1,10 +1,15 @@ ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,LEGACY ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-event-callbacks=true -S | FileCheck %s --check-prefixes=CHECK,EVENT_CALLBACKS +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-event-callbacks=true -S | FileCheck %s --check-prefixes=CHECK,EVENT_CALLBACKS ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI -; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST16 +; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -90,12 +95,12 @@ define [1 x i1] @load_array1([1 x i1]* %p) { ; EVENT_CALLBACKS: [[L:%.*]] = or i[[#SBITS]] ; EVENT_CALLBACKS: call void @__dfsan_load_callback(i[[#SBITS]] [[L]], i8* {{.*}}) - ; FAST16: @"dfs$load_array1" - ; FAST16: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] - ; FAST16: [[L:%.*]] = load i[[#SBITS]], i[[#SBITS]]* {{.*}}, align [[#SBYTES]] - ; FAST16: [[U:%.*]] = or i[[#SBITS]] [[L]], [[P]] - ; FAST16: [[S1:%.*]] = insertvalue [1 x i[[#SBITS]]] undef, i[[#SBITS]] [[U]], 0 - ; FAST16: store [1 x i[[#SBITS]]] [[S1]], [1 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [1 x i[[#SBITS]]]*), align [[ALIGN]] + ; FAST: @"dfs$load_array1" + ; FAST: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] + ; FAST: [[L:%.*]] = load i[[#SBITS]], i[[#SBITS]]* {{.*}}, align [[#SBYTES]] + ; FAST: [[U:%.*]] = or i[[#SBITS]] [[L]], [[P]] + ; FAST: [[S1:%.*]] = insertvalue [1 x i[[#SBITS]]] undef, i[[#SBITS]] [[U]], 0 + ; FAST: store [1 x i[[#SBITS]]] [[S1]], [1 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [1 x i[[#SBITS]]]*), align [[ALIGN]] ; LEGACY: @"dfs$load_array1" ; LEGACY: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] @@ -123,13 +128,13 @@ define [2 x i1] 
@load_array2([2 x i1]* %p) { ; EVENT_CALLBACKS: [[O2:%.*]] = or i[[#SBITS]] [[O1]] ; EVENT_CALLBACKS: call void @__dfsan_load_callback(i[[#SBITS]] [[O2]], i8* {{.*}}) - ; FAST16: @"dfs$load_array2" - ; FAST16: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] - ; FAST16: [[O:%.*]] = or i[[#SBITS]] - ; FAST16: [[U:%.*]] = or i[[#SBITS]] [[O]], [[P]] - ; FAST16: [[S:%.*]] = insertvalue [2 x i[[#SBITS]]] undef, i[[#SBITS]] [[U]], 0 - ; FAST16: [[S1:%.*]] = insertvalue [2 x i[[#SBITS]]] [[S]], i[[#SBITS]] [[U]], 1 - ; FAST16: store [2 x i[[#SBITS]]] [[S1]], [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [2 x i[[#SBITS]]]*), align [[ALIGN]] + ; FAST: @"dfs$load_array2" + ; FAST: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] + ; FAST: [[O:%.*]] = or i[[#SBITS]] + ; FAST: [[U:%.*]] = or i[[#SBITS]] [[O]], [[P]] + ; FAST: [[S:%.*]] = insertvalue [2 x i[[#SBITS]]] undef, i[[#SBITS]] [[U]], 0 + ; FAST: [[S1:%.*]] = insertvalue [2 x i[[#SBITS]]] [[S]], i[[#SBITS]] [[U]], 1 + ; FAST: store [2 x i[[#SBITS]]] [[S1]], [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [2 x i[[#SBITS]]]*), align [[ALIGN]] %a = load [2 x i1], [2 x i1]* %p ret [2 x i1] %a } @@ -150,14 +155,14 @@ define [4 x i1] @load_array4([4 x i1]* %p) { ; EVENT_CALLBACKS: [[O3:%.*]] = or i[[#SBITS]] [[O2]] ; EVENT_CALLBACKS: call void @__dfsan_load_callback(i[[#SBITS]] [[O3]], i8* {{.*}}) - ; FAST16: @"dfs$load_array4" - ; FAST16: [[T:%.*]] = trunc i[[#mul(4, SBITS)]] {{.*}} to i[[#SBITS]] - ; FAST16: [[O:%.*]] = or i[[#SBITS]] [[T]] - ; FAST16: [[S1:%.*]] = insertvalue [4 x i[[#SBITS]]] undef, i[[#SBITS]] [[O]], 0 - ; FAST16: [[S2:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S1]], i[[#SBITS]] [[O]], 1 - ; FAST16: [[S3:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S2]], i[[#SBITS]] [[O]], 2 - ; FAST16: [[S4:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S3]], i[[#SBITS]] [[O]], 3 - ; FAST16: store [4 x i[[#SBITS]]] [[S4]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align 2 + ; FAST: @"dfs$load_array4" + ; FAST: [[T:%.*]] = trunc i[[#mul(4, SBITS)]] {{.*}} to i[[#SBITS]] + ; FAST: [[O:%.*]] = or i[[#SBITS]] [[T]] + ; FAST: [[S1:%.*]] = insertvalue [4 x i[[#SBITS]]] undef, i[[#SBITS]] [[O]], 0 + ; FAST: [[S2:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S1]], i[[#SBITS]] [[O]], 1 + ; FAST: [[S3:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S2]], i[[#SBITS]] [[O]], 2 + ; FAST: [[S4:%.*]] = insertvalue [4 x i[[#SBITS]]] [[S3]], i[[#SBITS]] [[O]], 3 + ; FAST: store [4 x i[[#SBITS]]] [[S4]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align 2 ; LEGACY: @"dfs$load_array4" ; LEGACY: [[P:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] @@ -191,25 +196,25 @@ define [4 x i1] @insert_array([4 x i1] %a, i1 %e2) { } define void @store_alloca_array([4 x i1] %a) { - ; FAST16: @"dfs$store_alloca_array" - ; FAST16: [[S:%.*]] = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN:2]] - ; FAST16: [[SP:%.*]] = alloca i[[#SBITS]], align [[#SBYTES]] - ; FAST16: [[E0:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 0 - ; FAST16: [[E1:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 1 - ; FAST16: [[E01:%.*]] = or i[[#SBITS]] [[E0]], [[E1]] - ; FAST16: [[E2:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 2 - ; FAST16: 
[[E012:%.*]] = or i[[#SBITS]] [[E01]], [[E2]] - ; FAST16: [[E3:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 3 - ; FAST16: [[E0123:%.*]] = or i[[#SBITS]] [[E012]], [[E3]] - ; FAST16: store i[[#SBITS]] [[E0123]], i[[#SBITS]]* [[SP]], align [[#SBYTES]] + ; FAST: @"dfs$store_alloca_array" + ; FAST: [[S:%.*]] = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN:2]] + ; FAST: [[SP:%.*]] = alloca i[[#SBITS]], align [[#SBYTES]] + ; FAST: [[E0:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 0 + ; FAST: [[E1:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 1 + ; FAST: [[E01:%.*]] = or i[[#SBITS]] [[E0]], [[E1]] + ; FAST: [[E2:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 2 + ; FAST: [[E012:%.*]] = or i[[#SBITS]] [[E01]], [[E2]] + ; FAST: [[E3:%.*]] = extractvalue [4 x i[[#SBITS]]] [[S]], 3 + ; FAST: [[E0123:%.*]] = or i[[#SBITS]] [[E012]], [[E3]] + ; FAST: store i[[#SBITS]] [[E0123]], i[[#SBITS]]* [[SP]], align [[#SBYTES]] %p = alloca [4 x i1] store [4 x i1] %a, [4 x i1]* %p ret void } define void @store_zero_array([4 x i1]* %p) { - ; FAST16: @"dfs$store_zero_array" - ; FAST16: store i[[#mul(4, SBITS)]] 0, i[[#mul(4, SBITS)]]* {{.*}} + ; FAST: @"dfs$store_zero_array" + ; FAST: store i[[#mul(4, SBITS)]] 0, i[[#mul(4, SBITS)]]* {{.*}} store [4 x i1] zeroinitializer, [4 x i1]* %p ret void } @@ -227,15 +232,15 @@ define void @store_array2([2 x i1] %a, [2 x i1]* %p) { ; EVENT_CALLBACKS: [[P:%.*]] = bitcast [2 x i1]* %p to i8* ; EVENT_CALLBACKS: call void @__dfsan_store_callback(i[[#SBITS]] [[E12]], i8* [[P]]) - ; FAST16: @"dfs$store_array2" - ; FAST16: [[S:%.*]] = load [2 x i[[#SBITS]]], [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [2 x i[[#SBITS]]]*), align [[ALIGN:2]] - ; FAST16: [[E1:%.*]] = extractvalue [2 x i[[#SBITS]]] [[S]], 0 - ; FAST16: [[E2:%.*]] = extractvalue [2 x i[[#SBITS]]] [[S]], 1 - ; FAST16: [[E12:%.*]] = or i[[#SBITS]] [[E1]], [[E2]] - ; FAST16: [[SP0:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SP:%.*]], i32 0 - ; FAST16: store i[[#SBITS]] [[E12]], i[[#SBITS]]* [[SP0]], align [[#SBYTES]] - ; FAST16: [[SP1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SP]], i32 1 - ; FAST16: store i[[#SBITS]] [[E12]], i[[#SBITS]]* [[SP1]], align [[#SBYTES]] + ; FAST: @"dfs$store_array2" + ; FAST: [[S:%.*]] = load [2 x i[[#SBITS]]], [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [2 x i[[#SBITS]]]*), align [[ALIGN:2]] + ; FAST: [[E1:%.*]] = extractvalue [2 x i[[#SBITS]]] [[S]], 0 + ; FAST: [[E2:%.*]] = extractvalue [2 x i[[#SBITS]]] [[S]], 1 + ; FAST: [[E12:%.*]] = or i[[#SBITS]] [[E1]], [[E2]] + ; FAST: [[SP0:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SP:%.*]], i32 0 + ; FAST: store i[[#SBITS]] [[E12]], i[[#SBITS]]* [[SP0]], align [[#SBYTES]] + ; FAST: [[SP1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[SP]], i32 1 + ; FAST: store i[[#SBITS]] [[E12]], i[[#SBITS]]* [[SP1]], align [[#SBYTES]] ; COMBINE_STORE_PTR: @"dfs$store_array2" ; COMBINE_STORE_PTR: [[O:%.*]] = or i[[#SBITS]] @@ -250,72 +255,72 @@ define void @store_array2([2 x i1] %a, [2 x i1]* %p) { } define void @store_array17([17 x i1] %a, [17 x i1]* %p) { - ; FAST16: @"dfs$store_array17" - ; FAST16: %[[#R:]] = load [17 x i[[#SBITS]]], [17 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [17 x i[[#SBITS]]]*), align 2 - ; FAST16: %[[#R+1]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 0 - ; FAST16: %[[#R+2]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 1 - ; FAST16: %[[#R+3]] = or i[[#SBITS]] %[[#R+1]], %[[#R+2]] - ; FAST16: 
%[[#R+4]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 2 - ; FAST16: %[[#R+5]] = or i[[#SBITS]] %[[#R+3]], %[[#R+4]] - ; FAST16: %[[#R+6]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 3 - ; FAST16: %[[#R+7]] = or i[[#SBITS]] %[[#R+5]], %[[#R+6]] - ; FAST16: %[[#R+8]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 4 - ; FAST16: %[[#R+9]] = or i[[#SBITS]] %[[#R+7]], %[[#R+8]] - ; FAST16: %[[#R+10]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 5 - ; FAST16: %[[#R+11]] = or i[[#SBITS]] %[[#R+9]], %[[#R+10]] - ; FAST16: %[[#R+12]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 6 - ; FAST16: %[[#R+13]] = or i[[#SBITS]] %[[#R+11]], %[[#R+12]] - ; FAST16: %[[#R+14]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 7 - ; FAST16: %[[#R+15]] = or i[[#SBITS]] %[[#R+13]], %[[#R+14]] - ; FAST16: %[[#R+16]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 8 - ; FAST16: %[[#R+17]] = or i[[#SBITS]] %[[#R+15]], %[[#R+16]] - ; FAST16: %[[#R+18]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 9 - ; FAST16: %[[#R+19]] = or i[[#SBITS]] %[[#R+17]], %[[#R+18]] - ; FAST16: %[[#R+20]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 10 - ; FAST16: %[[#R+21]] = or i[[#SBITS]] %[[#R+19]], %[[#R+20]] - ; FAST16: %[[#R+22]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 11 - ; FAST16: %[[#R+23]] = or i[[#SBITS]] %[[#R+21]], %[[#R+22]] - ; FAST16: %[[#R+24]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 12 - ; FAST16: %[[#R+25]] = or i[[#SBITS]] %[[#R+23]], %[[#R+24]] - ; FAST16: %[[#R+26]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 13 - ; FAST16: %[[#R+27]] = or i[[#SBITS]] %[[#R+25]], %[[#R+26]] - ; FAST16: %[[#R+28]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 14 - ; FAST16: %[[#R+29]] = or i[[#SBITS]] %[[#R+27]], %[[#R+28]] - ; FAST16: %[[#R+30]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 15 - ; FAST16: %[[#R+31]] = or i[[#SBITS]] %[[#R+29]], %[[#R+30]] - ; FAST16: %[[#R+32]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 16 - ; FAST16: %[[#R+33]] = or i[[#SBITS]] %[[#R+31]], %[[#R+32]] - ; FAST16: %[[#VREG:]] = insertelement <8 x i[[#SBITS]]> undef, i[[#SBITS]] %[[#R+33]], i32 0 - ; FAST16: %[[#VREG+1]] = insertelement <8 x i[[#SBITS]]> %[[#VREG]], i[[#SBITS]] %[[#R+33]], i32 1 - ; FAST16: %[[#VREG+2]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+1]], i[[#SBITS]] %[[#R+33]], i32 2 - ; FAST16: %[[#VREG+3]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+2]], i[[#SBITS]] %[[#R+33]], i32 3 - ; FAST16: %[[#VREG+4]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+3]], i[[#SBITS]] %[[#R+33]], i32 4 - ; FAST16: %[[#VREG+5]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+4]], i[[#SBITS]] %[[#R+33]], i32 5 - ; FAST16: %[[#VREG+6]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+5]], i[[#SBITS]] %[[#R+33]], i32 6 - ; FAST16: %[[#VREG+7]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+6]], i[[#SBITS]] %[[#R+33]], i32 7 - ; FAST16: %[[#VREG+8]] = bitcast i[[#SBITS]]* %[[P:.*]] to <8 x i[[#SBITS]]>* - ; FAST16: %[[#VREG+9]] = getelementptr <8 x i[[#SBITS]]>, <8 x i[[#SBITS]]>* %[[#VREG+8]], i32 0 - ; FAST16: store <8 x i[[#SBITS]]> %[[#VREG+7]], <8 x i[[#SBITS]]>* %[[#VREG+9]], align [[#SBYTES]] - ; FAST16: %[[#VREG+10]] = getelementptr <8 x i[[#SBITS]]>, <8 x i[[#SBITS]]>* %[[#VREG+8]], i32 1 - ; FAST16: store <8 x i[[#SBITS]]> %[[#VREG+7]], <8 x i[[#SBITS]]>* %[[#VREG+10]], align [[#SBYTES]] - ; FAST16: %[[#VREG+11]] = getelementptr i[[#SBITS]], i[[#SBITS]]* %[[P]], i32 16 - ; FAST16: store i[[#SBITS]] %[[#R+33]], i[[#SBITS]]* %[[#VREG+11]], align [[#SBYTES]] + ; FAST: @"dfs$store_array17" + ; FAST: %[[#R:]] = load [17 x i[[#SBITS]]], [17 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to 
[17 x i[[#SBITS]]]*), align 2 + ; FAST: %[[#R+1]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 0 + ; FAST: %[[#R+2]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 1 + ; FAST: %[[#R+3]] = or i[[#SBITS]] %[[#R+1]], %[[#R+2]] + ; FAST: %[[#R+4]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 2 + ; FAST: %[[#R+5]] = or i[[#SBITS]] %[[#R+3]], %[[#R+4]] + ; FAST: %[[#R+6]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 3 + ; FAST: %[[#R+7]] = or i[[#SBITS]] %[[#R+5]], %[[#R+6]] + ; FAST: %[[#R+8]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 4 + ; FAST: %[[#R+9]] = or i[[#SBITS]] %[[#R+7]], %[[#R+8]] + ; FAST: %[[#R+10]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 5 + ; FAST: %[[#R+11]] = or i[[#SBITS]] %[[#R+9]], %[[#R+10]] + ; FAST: %[[#R+12]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 6 + ; FAST: %[[#R+13]] = or i[[#SBITS]] %[[#R+11]], %[[#R+12]] + ; FAST: %[[#R+14]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 7 + ; FAST: %[[#R+15]] = or i[[#SBITS]] %[[#R+13]], %[[#R+14]] + ; FAST: %[[#R+16]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 8 + ; FAST: %[[#R+17]] = or i[[#SBITS]] %[[#R+15]], %[[#R+16]] + ; FAST: %[[#R+18]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 9 + ; FAST: %[[#R+19]] = or i[[#SBITS]] %[[#R+17]], %[[#R+18]] + ; FAST: %[[#R+20]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 10 + ; FAST: %[[#R+21]] = or i[[#SBITS]] %[[#R+19]], %[[#R+20]] + ; FAST: %[[#R+22]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 11 + ; FAST: %[[#R+23]] = or i[[#SBITS]] %[[#R+21]], %[[#R+22]] + ; FAST: %[[#R+24]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 12 + ; FAST: %[[#R+25]] = or i[[#SBITS]] %[[#R+23]], %[[#R+24]] + ; FAST: %[[#R+26]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 13 + ; FAST: %[[#R+27]] = or i[[#SBITS]] %[[#R+25]], %[[#R+26]] + ; FAST: %[[#R+28]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 14 + ; FAST: %[[#R+29]] = or i[[#SBITS]] %[[#R+27]], %[[#R+28]] + ; FAST: %[[#R+30]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 15 + ; FAST: %[[#R+31]] = or i[[#SBITS]] %[[#R+29]], %[[#R+30]] + ; FAST: %[[#R+32]] = extractvalue [17 x i[[#SBITS]]] %[[#R]], 16 + ; FAST: %[[#R+33]] = or i[[#SBITS]] %[[#R+31]], %[[#R+32]] + ; FAST: %[[#VREG:]] = insertelement <8 x i[[#SBITS]]> undef, i[[#SBITS]] %[[#R+33]], i32 0 + ; FAST: %[[#VREG+1]] = insertelement <8 x i[[#SBITS]]> %[[#VREG]], i[[#SBITS]] %[[#R+33]], i32 1 + ; FAST: %[[#VREG+2]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+1]], i[[#SBITS]] %[[#R+33]], i32 2 + ; FAST: %[[#VREG+3]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+2]], i[[#SBITS]] %[[#R+33]], i32 3 + ; FAST: %[[#VREG+4]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+3]], i[[#SBITS]] %[[#R+33]], i32 4 + ; FAST: %[[#VREG+5]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+4]], i[[#SBITS]] %[[#R+33]], i32 5 + ; FAST: %[[#VREG+6]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+5]], i[[#SBITS]] %[[#R+33]], i32 6 + ; FAST: %[[#VREG+7]] = insertelement <8 x i[[#SBITS]]> %[[#VREG+6]], i[[#SBITS]] %[[#R+33]], i32 7 + ; FAST: %[[#VREG+8]] = bitcast i[[#SBITS]]* %[[P:.*]] to <8 x i[[#SBITS]]>* + ; FAST: %[[#VREG+9]] = getelementptr <8 x i[[#SBITS]]>, <8 x i[[#SBITS]]>* %[[#VREG+8]], i32 0 + ; FAST: store <8 x i[[#SBITS]]> %[[#VREG+7]], <8 x i[[#SBITS]]>* %[[#VREG+9]], align [[#SBYTES]] + ; FAST: %[[#VREG+10]] = getelementptr <8 x i[[#SBITS]]>, <8 x i[[#SBITS]]>* %[[#VREG+8]], i32 1 + ; FAST: store <8 x i[[#SBITS]]> %[[#VREG+7]], <8 x i[[#SBITS]]>* %[[#VREG+10]], align [[#SBYTES]] + ; FAST: %[[#VREG+11]] = getelementptr i[[#SBITS]], i[[#SBITS]]* %[[P]], i32 16 + ; FAST: store i[[#SBITS]] %[[#R+33]], i[[#SBITS]]* %[[#VREG+11]], align [[#SBYTES]] 
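+  ; COMM: The 17 shadow elements above are stored as two <8 x i[[#SBITS]]>
+  ; COMM: vector stores followed by a single scalar store for the remainder.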
store [17 x i1] %a, [17 x i1]* %p ret void } define [2 x i32] @const_array() { - ; FAST16: @"dfs$const_array" - ; FAST16: store [2 x i[[#SBITS]]] zeroinitializer, [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [2 x i[[#SBITS]]]*), align 2 + ; FAST: @"dfs$const_array" + ; FAST: store [2 x i[[#SBITS]]] zeroinitializer, [2 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [2 x i[[#SBITS]]]*), align 2 ret [2 x i32] [ i32 42, i32 11 ] } define [4 x i8] @call_array([4 x i8] %a) { - ; FAST16-LABEL: @"dfs$call_array" - ; FAST16: %[[#R:]] = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN:2]] - ; FAST16: store [4 x i[[#SBITS]]] %[[#R]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]] - ; FAST16: %_dfsret = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]] - ; FAST16: store [4 x i[[#SBITS]]] %_dfsret, [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]] + ; FAST-LABEL: @"dfs$call_array" + ; FAST: %[[#R:]] = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN:2]] + ; FAST: store [4 x i[[#SBITS]]] %[[#R]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]] + ; FAST: %_dfsret = load [4 x i[[#SBITS]]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]] + ; FAST: store [4 x i[[#SBITS]]] %_dfsret, [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]] %r = call [4 x i8] @pass_array([4 x i8] %a) ret [4 x i8] %r @@ -324,31 +329,31 @@ define [4 x i8] @call_array([4 x i8] %a) { %LargeArr = type [1000 x i8] define i8 @fun_with_large_args(i1 %i, %LargeArr %a) { - ; FAST16: @"dfs$fun_with_large_args" - ; FAST16: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2 + ; FAST: @"dfs$fun_with_large_args" + ; FAST: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2 %r = extractvalue %LargeArr %a, 0 ret i8 %r } define %LargeArr @fun_with_large_ret() { - ; FAST16: @"dfs$fun_with_large_ret" - ; FAST16-NEXT: ret [1000 x i8] zeroinitializer + ; FAST: @"dfs$fun_with_large_ret" + ; FAST-NEXT: ret [1000 x i8] zeroinitializer ret %LargeArr zeroinitializer } define i8 @call_fun_with_large_ret() { - ; FAST16: @"dfs$call_fun_with_large_ret" - ; FAST16: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2 + ; FAST: @"dfs$call_fun_with_large_ret" + ; FAST: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2 %r = call %LargeArr @fun_with_large_ret() %e = extractvalue %LargeArr %r, 0 ret i8 %e } define i8 @call_fun_with_large_args(i1 %i, %LargeArr %a) { - ; FAST16: @"dfs$call_fun_with_large_args" - ; FAST16: [[I:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] - ; FAST16: store i[[#SBITS]] [[I]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] - ; FAST16: %r = call i8 @"dfs$fun_with_large_args"(i1 %i, [1000 x i8] %a) + ; FAST: @"dfs$call_fun_with_large_args" + ; FAST: [[I:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to 
i[[#SBITS]]*), align [[ALIGN:2]] + ; FAST: store i[[#SBITS]] [[I]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: %r = call i8 @"dfs$fun_with_large_args"(i1 %i, [1000 x i8] %a) %r = call i8 @fun_with_large_args(i1 %i, %LargeArr %a) ret i8 %r diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll index f5a225b0cb04..c917774b4506 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16 +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK ; ; The patterns about origins cannot be tested until the origin tracking feature is complete. target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll index 98fb755992bf..e45a8b4fc3da 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/basic.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,CHECK_NO_ORIGIN -DSHADOW_MASK=-123145302310913 ; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK_ORIGIN -DSHADOW_MASK=-123145302310913 +; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK_NO_ORIGIN -DSHADOW_MASK=-105553116266497 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/call.ll b/llvm/test/Instrumentation/DataFlowSanitizer/call.ll index cbe62fa37e3e..bb47d8c6ba30 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/call.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/call.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -dfsan -S | FileCheck %s ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -S | FileCheck %s +; RUN: opt < %s -dfsan -dfsan-fast-8-labels -S | FileCheck %s ; RUN: opt < %s -passes=dfsan -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll b/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll index 5b538febb3e3..2b5cd5e3065c 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/external_mask.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,CHECK16 ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16 +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll b/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll index 
00f05d0c8fb1..c6b998fb28e6 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/fast16labels.ll @@ -1,6 +1,7 @@ ; Test that -dfsan-fast-16-labels mode uses inline ORs rather than calling ; __dfsan_union or __dfsan_union_load. ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -S | FileCheck %s --implicit-check-not="call{{.*}}__dfsan_union" --check-prefixes=CHECK,CHECK16 +; RUN: opt < %s -dfsan -dfsan-fast-8-labels -S | FileCheck %s --implicit-check-not="call{{.*}}__dfsan_union" --check-prefixes=CHECK,CHECK8 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -13,7 +14,7 @@ define i8 @add(i8 %a, i8 %b) { ; CHECK-LABEL: define i8 @"dfs$add" ; CHECK-DAG: %[[ALABEL:.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] ; CHECK-DAG: %[[BLABEL:.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN]] - ; CHECK: %[[ADDLABEL:.*]] = or i16 %[[ALABEL]], %[[BLABEL]] + ; CHECK: %[[ADDLABEL:.*]] = or i[[#SBITS]] %[[ALABEL]], %[[BLABEL]] ; CHECK: %c = add i8 %a, %b ; CHECK: store i[[#SBITS]] %[[ADDLABEL]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] ; CHECK: ret i8 %c @@ -24,7 +25,7 @@ define i8 @add(i8 %a, i8 %b) { define i8 @load8(i8* %p) { ; CHECK-LABEL: define i8 @"dfs$load8" ; CHECK-SAME: (i8* %[[PADDR:.*]]) - ; CHECK-NEXT: %[[#ARG:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i16*), align [[ALIGN]] + ; CHECK-NEXT: %[[#ARG:]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] ; CHECK-NEXT: %[[#R:]] = ptrtoint i8* %[[PADDR]] to i64 ; CHECK-NEXT: %[[#PS:R+1]] = and i64 %[[#R]], [[#%.10d,MASK:]] ; CHECK16-NEXT: %[[#PS:R+2]] = mul i64 %[[#R+1]], 2 @@ -106,6 +107,16 @@ define i64 @load64(i64* %p) { ; CHECK16-NEXT: %[[#WS+5]] = trunc i64 %[[#WS+4]] to i[[#SBITS]] ; CHECK16-NEXT: %[[#S_OUT:]] = or i[[#SBITS]] %[[#WS+5]], %[[#ARG]] + ; COMM: On fast8, no need to OR the wide shadow but one more shift is needed. + ; CHECK8-NEXT: %[[#WS+1]] = lshr i64 %[[#WS]], 32 + ; CHECK8-NEXT: %[[#WS+2]] = or i64 %[[#WS]], %[[#WS+1]] + ; CHECK8-NEXT: %[[#WS+3]] = lshr i64 %[[#WS+2]], 16 + ; CHECK8-NEXT: %[[#WS+4]] = or i64 %[[#WS+2]], %[[#WS+3]] + ; CHECK8-NEXT: %[[#WS+5]] = lshr i64 %[[#WS+4]], 8 + ; CHECK8-NEXT: %[[#WS+6]] = or i64 %[[#WS+4]], %[[#WS+5]] + ; CHECK8-NEXT: %[[#WS+7]] = trunc i64 %[[#WS+6]] to i[[#SBITS]] + ; CHECK8-NEXT: %[[#S_OUT:]] = or i[[#SBITS]] %[[#WS+7]], %[[#ARG]] + ; CHECK-NEXT: %a = load i64, i64* %p ; CHECK-NEXT: store i[[#SBITS]] %[[#S_OUT]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] ; CHECK-NEXT: ret i64 %a @@ -142,6 +153,16 @@ define i128 @load128(i128* %p) { ; CHECK16-NEXT: %[[#WS+5]] = trunc i64 %[[#WS+4]] to i[[#SBITS]] ; CHECK16-NEXT: %[[#S_OUT:]] = or i[[#SBITS]] %[[#WS+5]], %[[#ARG]] + ; COMM: On fast8, we need to OR 2x64bits for the wide shadow, before ORing its bytes (one more shift). 
+ ; CHECK8-NEXT: %[[#WS+1]] = lshr i64 %[[#WS]], 32 + ; CHECK8-NEXT: %[[#WS+2]] = or i64 %[[#WS]], %[[#WS+1]] + ; CHECK8-NEXT: %[[#WS+3]] = lshr i64 %[[#WS+2]], 16 + ; CHECK8-NEXT: %[[#WS+4]] = or i64 %[[#WS+2]], %[[#WS+3]] + ; CHECK8-NEXT: %[[#WS+5]] = lshr i64 %[[#WS+4]], 8 + ; CHECK8-NEXT: %[[#WS+6]] = or i64 %[[#WS+4]], %[[#WS+5]] + ; CHECK8-NEXT: %[[#WS+7]] = trunc i64 %[[#WS+6]] to i[[#SBITS]] + ; CHECK8-NEXT: %[[#S_OUT:]] = or i[[#SBITS]] %[[#WS+7]], %[[#ARG]] + ; CHECK-NEXT: %a = load i128, i128* %p ; CHECK-NEXT: store i[[#SBITS]] %[[#S_OUT]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] ; CHECK-NEXT: ret i128 %a diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll b/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll index e9ef73ecca80..fe2b0ba3b47f 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/phi.ll @@ -1,5 +1,6 @@ ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,LEGACY ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -11,7 +12,7 @@ define {i32, i32} @test({i32, i32} %a, i1 %c) { ; LEGACY: [[PL:%.*]] = phi i[[#SBITS]] [ [[AL]], %T ], [ [[AL]], %F ] ; LEGACY: store i[[#SBITS]] [[PL]], i[[#SBITS]]* bitcast ([100 x i64]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] - ; FAST: [[AL:%.*]] = load { [[ST:i[0-9]+]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([100 x i64]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] + ; FAST: [[AL:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([100 x i64]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] ; FAST: [[AL0:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[AL]], i[[#SBITS]] 0, 0 ; FAST: [[AL1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[AL]], i[[#SBITS]] 0, 1 ; FAST: [[PL:%.*]] = phi { i[[#SBITS]], i[[#SBITS]] } [ [[AL0]], %T ], [ [[AL1]], %F ] diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/select.ll b/llvm/test/Instrumentation/DataFlowSanitizer/select.ll index e15e9932ec7b..6a3355b4a649 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/select.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/select.ll @@ -2,6 +2,8 @@ ; RUN: opt < %s -dfsan -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_LEGACY ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-track-select-control-flow=1 -S | FileCheck %s --check-prefixes=CHECK,TRACK_CF,TRACK_CF_FAST ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_FAST +; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-track-select-control-flow=1 -S | FileCheck %s --check-prefixes=CHECK,TRACK_CF,TRACK_CF_FAST +; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-track-select-control-flow=0 -S | FileCheck %s --check-prefixes=CHECK,NO_TRACK_CF,NO_TRACK_CF_FAST target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff 
--git a/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll b/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll index a10edd5792d7..9c7440e5ebaf 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/shadow-args-zext.ll @@ -1,5 +1,6 @@ ; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt | FileCheck %s ; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt -dfsan-fast-16-labels | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-linux-gnu < %s -dfsan -S --dfsan-abilist=%S/Inputs/shadow-args-abilist.txt -dfsan-fast-8-labels | FileCheck %s ; REQUIRES: x86-registered-target diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/store.ll b/llvm/test/Instrumentation/DataFlowSanitizer/store.ll index a66b6c86dfde..660656f57d26 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/store.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/store.ll @@ -2,6 +2,8 @@ ; RUN: opt < %s -dfsan -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,NO_COMBINE_PTR_LABEL ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-combine-pointer-labels-on-store=1 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,COMBINE_PTR_LABEL_FAST ; RUN: opt < %s -dfsan -dfsan-fast-16-labels -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,NO_COMBINE_PTR_LABEL +; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-combine-pointer-labels-on-store=1 -S | FileCheck %s --check-prefixes=CHECK,COMBINE_PTR_LABEL_FAST +; RUN: opt < %s -dfsan -dfsan-fast-8-labels -dfsan-combine-pointer-labels-on-store=0 -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_PTR_LABEL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll b/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll index f45a1597cec4..4d68a2c0577c 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/struct.ll @@ -1,11 +1,16 @@ ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,LEGACY ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-event-callbacks=true -S | FileCheck %s --check-prefixes=CHECK,EVENT_CALLBACKS ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI -; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST16 +; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-track-select-control-flow=false -S | FileCheck %s --check-prefixes=CHECK,NO_SELECT_CONTROL ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,FAST +; RUN: opt < %s -dfsan 
-dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-load=false -S | FileCheck %s --check-prefixes=CHECK,NO_COMBINE_LOAD_PTR +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-combine-pointer-labels-on-store=true -S | FileCheck %s --check-prefixes=CHECK,COMBINE_STORE_PTR +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-track-select-control-flow=false -S | FileCheck %s --check-prefixes=CHECK,NO_SELECT_CONTROL +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -dfsan-debug-nonzero-labels -S | FileCheck %s --check-prefixes=CHECK,DEBUG_NONZERO_LABELS target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -73,18 +78,18 @@ define {i1, i32} @select_struct(i1 %c, {i1, i32} %a, {i1, i32} %b) { ; NO_SELECT_CONTROL: [[S:%.*]] = select i1 %c, { i[[#SBITS]], i[[#SBITS]] } [[A]], { i[[#SBITS]], i[[#SBITS]] } [[B]] ; NO_SELECT_CONTROL: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] - ; FAST16: @"dfs$select_struct" - ; FAST16: %[[#R:]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] - ; FAST16: %[[#R+1]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] - ; FAST16: %[[#R+2]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] - ; FAST16: %[[#R+3]] = select i1 %c, { i[[#SBITS]], i[[#SBITS]] } %[[#R+1]], { i[[#SBITS]], i[[#SBITS]] } %[[#R]] - ; FAST16: %[[#R+4]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+3]], 0 - ; FAST16: %[[#R+5]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+3]], 1 - ; FAST16: %[[#R+6]] = or i[[#SBITS]] %[[#R+4]], %[[#R+5]] - ; FAST16: %[[#R+7]] = or i[[#SBITS]] %[[#R+2]], %[[#R+6]] - ; FAST16: %[[#R+8]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } undef, i[[#SBITS]] %[[#R+7]], 0 - ; FAST16: %[[#R+9]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+8]], i[[#SBITS]] %[[#R+7]], 1 - ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } %[[#R+9]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: @"dfs$select_struct" + ; FAST: %[[#R:]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] + ; FAST: %[[#R+1]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: %[[#R+2]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: %[[#R+3]] = select i1 %c, { i[[#SBITS]], i[[#SBITS]] } %[[#R+1]], { i[[#SBITS]], i[[#SBITS]] } %[[#R]] + ; FAST: %[[#R+4]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+3]], 0 + ; FAST: %[[#R+5]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+3]], 1 + ; FAST: %[[#R+6]] = or i[[#SBITS]] %[[#R+4]], %[[#R+5]] + ; FAST: %[[#R+7]] = or i[[#SBITS]] 
%[[#R+2]], %[[#R+6]] + ; FAST: %[[#R+8]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } undef, i[[#SBITS]] %[[#R+7]], 0 + ; FAST: %[[#R+9]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } %[[#R+8]], i[[#SBITS]] %[[#R+7]], 1 + ; FAST: store { i[[#SBITS]], i[[#SBITS]] } %[[#R+9]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] ; LEGACY: @"dfs$select_struct" ; LEGACY: [[U:%.*]] = call zeroext i[[#SBITS]] @__dfsan_union @@ -96,13 +101,13 @@ define {i1, i32} @select_struct(i1 %c, {i1, i32} %a, {i1, i32} %b) { } define { i32, i32 } @asm_struct(i32 %0, i32 %1) { - ; FAST16: @"dfs$asm_struct" - ; FAST16: [[E1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]] - ; FAST16: [[E0:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] - ; FAST16: [[E01:%.*]] = or i[[#SBITS]] [[E0]], [[E1]] - ; FAST16: [[S0:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } undef, i[[#SBITS]] [[E01]], 0 - ; FAST16: [[S1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[S0]], i[[#SBITS]] [[E01]], 1 - ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[S1]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: @"dfs$asm_struct" + ; FAST: [[E1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]] + ; FAST: [[E0:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: [[E01:%.*]] = or i[[#SBITS]] [[E0]], [[E1]] + ; FAST: [[S0:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } undef, i[[#SBITS]] [[E01]], 0 + ; FAST: [[S1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[S0]], i[[#SBITS]] [[E01]], 1 + ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[S1]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] ; LEGACY: @"dfs$asm_struct" ; LEGACY: [[E1:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]] @@ -117,8 +122,8 @@ entry: } define {i32, i32} @const_struct() { - ; FAST16: @"dfs$const_struct" - ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } zeroinitializer, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align 2 + ; FAST: @"dfs$const_struct" + ; FAST: store { i[[#SBITS]], i[[#SBITS]] } zeroinitializer, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align 2 ; LEGACY: @"dfs$const_struct" ; LEGACY: store i[[#SBITS]] 0, i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align 2 @@ -126,10 +131,10 @@ define {i32, i32} @const_struct() { } define i1 @extract_struct({i1, i5} %s) { - ; FAST16: @"dfs$extract_struct" - ; FAST16: [[SM:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] - ; FAST16: [[EM:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[SM]], 0 - ; FAST16: store i[[#SBITS]] [[EM]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: @"dfs$extract_struct" + ; FAST: [[SM:%.*]] = load { i[[#SBITS]], 
i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] + ; FAST: [[EM:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[SM]], 0 + ; FAST: store i[[#SBITS]] [[EM]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] ; LEGACY: @"dfs$extract_struct" ; LEGACY: [[SM:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN:2]] @@ -139,11 +144,11 @@ define i1 @extract_struct({i1, i5} %s) { } define {i1, i5} @insert_struct({i1, i5} %s, i5 %e1) { - ; FAST16: @"dfs$insert_struct" - ; FAST16: [[EM:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES)]]) to i[[#SBITS]]*), align [[ALIGN:2]] - ; FAST16: [[SM:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] - ; FAST16: [[SM1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[SM]], i[[#SBITS]] [[EM]], 1 - ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[SM1]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: @"dfs$insert_struct" + ; FAST: [[EM:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES)]]) to i[[#SBITS]]*), align [[ALIGN:2]] + ; FAST: [[SM:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: [[SM1:%.*]] = insertvalue { i[[#SBITS]], i[[#SBITS]] } [[SM]], i[[#SBITS]] [[EM]], 1 + ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[SM1]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] ; LEGACY: @"dfs$insert_struct" ; LEGACY: [[EM:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to i[[#SBITS]]*), align [[ALIGN:2]] @@ -173,15 +178,15 @@ define {i1, i1} @load_struct({i1, i1}* %p) { } define void @store_struct({i1, i1}* %p, {i1, i1} %s) { - ; FAST16: @"dfs$store_struct" - ; FAST16: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] - ; FAST16: [[E0:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[S]], 0 - ; FAST16: [[E1:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[S]], 1 - ; FAST16: [[E:%.*]] = or i[[#SBITS]] [[E0]], [[E1]] - ; FAST16: [[P0:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[P:%.*]], i32 0 - ; FAST16: store i[[#SBITS]] [[E]], i[[#SBITS]]* [[P0]], align [[#SBYTES]] - ; FAST16: [[P1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[P]], i32 1 - ; FAST16: store i[[#SBITS]] [[E]], i[[#SBITS]]* [[P1]], align [[#SBYTES]] + ; FAST: @"dfs$store_struct" + ; FAST: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] + ; FAST: [[E0:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[S]], 0 + ; FAST: [[E1:%.*]] = extractvalue { i[[#SBITS]], i[[#SBITS]] } [[S]], 1 + ; FAST: [[E:%.*]] = or i[[#SBITS]] [[E0]], [[E1]] + ; FAST: [[P0:%.*]] = getelementptr 
i[[#SBITS]], i[[#SBITS]]* [[P:%.*]], i32 0 + ; FAST: store i[[#SBITS]] [[E]], i[[#SBITS]]* [[P0]], align [[#SBYTES]] + ; FAST: [[P1:%.*]] = getelementptr i[[#SBITS]], i[[#SBITS]]* [[P]], i32 1 + ; FAST: store i[[#SBITS]] [[E]], i[[#SBITS]]* [[P1]], align [[#SBYTES]] ; EVENT_CALLBACKS: @"dfs$store_struct" ; EVENT_CALLBACKS: [[OL:%.*]] = or i[[#SBITS]] @@ -204,68 +209,68 @@ define void @store_struct({i1, i1}* %p, {i1, i1} %s) { } define i2 @extract_struct_of_aggregate11(%StructOfAggr %s) { - ; FAST16: @"dfs$extract_struct_of_aggregate11" - ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] - ; FAST16: [[E11:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 1, 1 - ; FAST16: store i[[#SBITS]] [[E11]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: @"dfs$extract_struct_of_aggregate11" + ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] + ; FAST: [[E11:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 1, 1 + ; FAST: store i[[#SBITS]] [[E11]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] %e11 = extractvalue %StructOfAggr %s, 1, 1 ret i2 %e11 } define [4 x i2] @extract_struct_of_aggregate1(%StructOfAggr %s) { - ; FAST16: @"dfs$extract_struct_of_aggregate1" - ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] - ; FAST16: [[E1:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 1 - ; FAST16: store [4 x i[[#SBITS]]] [[E1]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]] + ; FAST: @"dfs$extract_struct_of_aggregate1" + ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] + ; FAST: [[E1:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 1 + ; FAST: store [4 x i[[#SBITS]]] [[E1]], [4 x i[[#SBITS]]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to [4 x i[[#SBITS]]]*), align [[ALIGN]] %e1 = extractvalue %StructOfAggr %s, 1 ret [4 x i2] %e1 } define <4 x i3> @extract_struct_of_aggregate2(%StructOfAggr %s) { - ; FAST16: @"dfs$extract_struct_of_aggregate2" - ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* 
@__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] - ; FAST16: [[E2:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 2 - ; FAST16: store i[[#SBITS]] [[E2]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: @"dfs$extract_struct_of_aggregate2" + ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] + ; FAST: [[E2:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 2 + ; FAST: store i[[#SBITS]] [[E2]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] %e2 = extractvalue %StructOfAggr %s, 2 ret <4 x i3> %e2 } define { i1, i1 } @extract_struct_of_aggregate3(%StructOfAggr %s) { - ; FAST16: @"dfs$extract_struct_of_aggregate3" - ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] - ; FAST16: [[E3:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 3 - ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[E3]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: @"dfs$extract_struct_of_aggregate3" + ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] + ; FAST: [[E3:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 3 + ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[E3]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] %e3 = extractvalue %StructOfAggr %s, 3 ret { i1, i1 } %e3 } define i1 @extract_struct_of_aggregate31(%StructOfAggr %s) { - ; FAST16: @"dfs$extract_struct_of_aggregate31" - ; FAST16: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] - ; FAST16: [[E31:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 3, 1 - ; FAST16: store i[[#SBITS]] [[E31]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: @"dfs$extract_struct_of_aggregate31" + ; FAST: [[E:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { 
i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN:2]] + ; FAST: [[E31:%.*]] = extractvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[E]], 3, 1 + ; FAST: store i[[#SBITS]] [[E31]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to i[[#SBITS]]*), align [[ALIGN]] %e31 = extractvalue %StructOfAggr %s, 3, 1 ret i1 %e31 } define %StructOfAggr @insert_struct_of_aggregate11(%StructOfAggr %s, i2 %e11) { - ; FAST16: @"dfs$insert_struct_of_aggregate11" - ; FAST16: [[E11:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(8, SBYTES)]]) to i[[#SBITS]]*), align [[ALIGN:2]] - ; FAST16: [[S:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]] - ; FAST16: [[S1:%.*]] = insertvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[S]], i[[#SBITS]] [[E11]], 1, 1 - ; FAST16: store { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[S1]], { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]] + ; FAST: @"dfs$insert_struct_of_aggregate11" + ; FAST: [[E11:%.*]] = load i[[#SBITS]], i[[#SBITS]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(8, SBYTES)]]) to i[[#SBITS]]*), align [[ALIGN:2]] + ; FAST: [[S:%.*]] = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]] + ; FAST: [[S1:%.*]] = insertvalue { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[S]], i[[#SBITS]] [[E11]], 1, 1 + ; FAST: store { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } [[S1]], { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]] %s1 = insertvalue %StructOfAggr %s, i2 %e11, 1, 1 ret %StructOfAggr %s1 } define {i8*, i32} @call_struct({i8*, i32} %s) { - ; FAST16: @"dfs$call_struct" - ; FAST16: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] - ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] - ; FAST16: %_dfsret = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] - ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } %_dfsret, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: @"dfs$call_struct" + ; FAST: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], 
i[[#SBITS]] }*), align [[ALIGN:2]] + ; FAST: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: %_dfsret = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: store { i[[#SBITS]], i[[#SBITS]] } %_dfsret, { i[[#SBITS]], i[[#SBITS]] }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] %r = call {i8*, i32} @pass_struct({i8*, i32} %s) ret {i8*, i32} %r @@ -274,15 +279,15 @@ define {i8*, i32} @call_struct({i8*, i32} %s) { declare %StructOfAggr @fun_with_many_aggr_args(<2 x i7> %v, [2 x i5] %a, {i3, i3} %s) define %StructOfAggr @call_many_aggr_args(<2 x i7> %v, [2 x i5] %a, {i3, i3} %s) { - ; FAST16: @"dfs$call_many_aggr_args" - ; FAST16: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] - ; FAST16: [[A:%.*]] = load [2 x i[[#SBITS]]], [2 x i[[#SBITS]]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to [2 x i[[#SBITS]]]*), align [[ALIGN]] - ; FAST16: [[V:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] - ; FAST16: store i[[#SBITS]] [[V]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] - ; FAST16: store [2 x i[[#SBITS]]] [[A]], [2 x i[[#SBITS]]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to [2 x i[[#SBITS]]]*), align [[ALIGN]] - ; FAST16: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] - ; FAST16: %_dfsret = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]] - ; FAST16: store { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } %_dfsret, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]] + ; FAST: @"dfs$call_many_aggr_args" + ; FAST: [[S:%.*]] = load { i[[#SBITS]], i[[#SBITS]] }, { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN:2]] + ; FAST: [[A:%.*]] = load [2 x i[[#SBITS]]], [2 x i[[#SBITS]]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to [2 x i[[#SBITS]]]*), align [[ALIGN]] + ; FAST: [[V:%.*]] = load i[[#SBITS]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: store i[[#SBITS]] [[V]], i[[#SBITS]]* bitcast ([[TLS_ARR]]* @__dfsan_arg_tls to i[[#SBITS]]*), align [[ALIGN]] + ; FAST: store [2 x i[[#SBITS]]] [[A]], [2 x i[[#SBITS]]]* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 2) to [2 x i[[#SBITS]]]*), align [[ALIGN]] + ; 
FAST: store { i[[#SBITS]], i[[#SBITS]] } [[S]], { i[[#SBITS]], i[[#SBITS]] }* inttoptr (i64 add (i64 ptrtoint ([[TLS_ARR]]* @__dfsan_arg_tls to i64), i64 [[#mul(2, SBYTES) + 2]]) to { i[[#SBITS]], i[[#SBITS]] }*), align [[ALIGN]] + ; FAST: %_dfsret = load { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]] + ; FAST: store { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } } %_dfsret, { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }* bitcast ([[TLS_ARR]]* @__dfsan_retval_tls to { i[[#SBITS]], [4 x i[[#SBITS]]], i[[#SBITS]], { i[[#SBITS]], i[[#SBITS]] } }*), align [[ALIGN]] %r = call %StructOfAggr @fun_with_many_aggr_args(<2 x i7> %v, [2 x i5] %a, {i3, i3} %s) ret %StructOfAggr %r diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll b/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll index 0a2d29c68bdd..51de5f620c71 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/vector.ll @@ -1,6 +1,7 @@ ; RUN: opt < %s -dfsan -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_LEGACY ; RUN: opt < %s -dfsan -dfsan-args-abi -S | FileCheck %s --check-prefixes=CHECK,ARGS_ABI ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_FAST +; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK,TLS_ABI,TLS_ABI_FAST target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -- GitLab From c9861f722e375c419a07bcb70c54fe1384cd2999 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 18 Mar 2021 16:22:19 -0700 Subject: [PATCH 0098/1000] [RISCV] Correct the output chain in lowerFixedLengthVectorMaskedLoadToRVV We returned the input chain instead of the output chain from the new load. This bypasses the load in the chain. I haven't found a good way to test this yet. IR order prevents my initial attempts at causing reordering. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index bea946daa473..8c085425eb0a 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3217,7 +3217,7 @@ SDValue RISCVTargetLowering::lowerFixedLengthVectorMaskedLoadToRVV( Load->getMemoryVT(), Load->getMemOperand()); SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget); - return DAG.getMergeValues({Result, Load->getChain()}, DL); + return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL); } SDValue RISCVTargetLowering::lowerFixedLengthVectorMaskedStoreToRVV( -- GitLab From 9558456b5370e64560e76f6580b979fccadd4744 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 18 Mar 2021 16:46:04 -0700 Subject: [PATCH 0099/1000] [SanitizerCoverage] Make __start_/__stop_ symbols extern_weak On ELF, we place the metadata sections (`__sancov_guards`, `__sancov_cntrs`, `__sancov_bools`, `__sancov_pcs`) in section groups (either `comdat any` or `comdat noduplicates`).
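For reference, a minimal sketch of such a grouped metadata global in IR (the `$foo` comdat and the `@__sancov_gen_` name mirror the tests updated below):

```
; Sketch only: one 8-bit counter for a function foo, placed in the
; __sancov_cntrs section and tied to foo's comdat group, so the linker
; keeps or discards the counter together with the function.
$foo = comdat noduplicates
@__sancov_gen_ = private global [1 x i8] zeroinitializer, section "__sancov_cntrs", comdat($foo), align 1
```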
With `--gc-sections`, LLD since D96753 and GNU ld `-z start-stop-gc` may garbage collect such sections. If all `__sancov_bools` are discarded, LLD will error `error: undefined hidden symbol: __start___sancov_cntrs` (other sections are similar). ``` % cat a.c void discarded() {} % clang -fsanitize-coverage=func,trace-pc-guard -fpic -fvisibility=hidden a.c -shared -fuse-ld=lld -Wl,--gc-sections ... ld.lld: error: undefined hidden symbol: __start___sancov_guards >>> referenced by a.c >>> /tmp/a-456662.o:(sancov.module_ctor_trace_pc_guard) ``` Use the `extern_weak` linkage (lowered to undefined weak symbols) to avoid the undefined error. Differential Revision: https://reviews.llvm.org/D98903 --- .../Transforms/Instrumentation/SanitizerCoverage.cpp | 10 ++++++---- .../SanitizerCoverage/inline-8bit-counters.ll | 5 ++++- .../SanitizerCoverage/inline-bool-flag.ll | 2 ++ .../test/Instrumentation/SanitizerCoverage/pc-table.ll | 2 ++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 843a27faf168..16ba84fdd00b 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -328,13 +328,15 @@ PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M, std::pair ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section, Type *Ty) { + // Use ExternalWeak so that if all sections are discarded due to section + // garbage collection, the linker will not report undefined symbol errors. GlobalVariable *SecStart = new GlobalVariable( - M, Ty->getPointerElementType(), false, GlobalVariable::ExternalLinkage, - nullptr, getSectionStart(Section)); + M, Ty->getPointerElementType(), false, + GlobalVariable::ExternalWeakLinkage, nullptr, getSectionStart(Section)); SecStart->setVisibility(GlobalValue::HiddenVisibility); GlobalVariable *SecEnd = new GlobalVariable( - M, Ty->getPointerElementType(), false, GlobalVariable::ExternalLinkage, - nullptr, getSectionEnd(Section)); + M, Ty->getPointerElementType(), false, + GlobalVariable::ExternalWeakLinkage, nullptr, getSectionEnd(Section)); SecEnd->setVisibility(GlobalValue::HiddenVisibility); IRBuilder<> IRB(M.getContext()); if (!TargetTriple.isOSBinFormatCOFF()) diff --git a/llvm/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll b/llvm/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll index 4f905428769a..3611cba34ce5 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/inline-8bit-counters.ll @@ -2,11 +2,14 @@ ; RUN: opt < %s -sancov -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -S -enable-new-pm=0 | FileCheck %s ; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-inline-8bit-counters=1 -S | FileCheck %s +; CHECK: @__sancov_gen_ = private global [1 x i8] zeroinitializer, section "__sancov_cntrs", comdat($foo), align 1 +; CHECK: @__start___sancov_cntrs = extern_weak hidden global i8 +; CHECK-NEXT: @__stop___sancov_cntrs = extern_weak hidden global i8 + target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" define void @foo() { entry: -; CHECK: section "__sancov_cntrs", comdat($foo), align 1 ; CHECK: %0 = load i8, i8* getelementptr 
inbounds ([1 x i8], [1 x i8]* @__sancov_gen_, i64 0, i64 0), align 1, !nosanitize ; CHECK: %1 = add i8 %0, 1 ; CHECK: store i8 %1, i8* getelementptr inbounds ([1 x i8], [1 x i8]* @__sancov_gen_, i64 0, i64 0), align 1, !nosanitize diff --git a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll index e711d96a5a43..48a4c60d98b2 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/inline-bool-flag.ll @@ -4,6 +4,8 @@ ; CHECK: $foo = comdat noduplicates ; CHECK: @__sancov_gen_ = private global [1 x i1] zeroinitializer, section "__sancov_bools", comdat($foo), align 1{{$}} +; CHECK: @__start___sancov_bools = extern_weak hidden global i1 +; CHECK-NEXT: @__stop___sancov_bools = extern_weak hidden global i1 ; CHECK-NOT: @llvm.used = ; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* bitcast ([1 x i1]* @__sancov_gen_ to i8*)], section "llvm.metadata" diff --git a/llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll b/llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll index eeeb56bfb2ab..8ebb6a05ca7c 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/pc-table.ll @@ -22,6 +22,8 @@ entry: } ; CHECK: private constant [6 x i64*] [{{.*}}@foo{{.*}}blockaddress{{.*}}blockaddress{{.*}}], section "__sancov_pcs", comdat($foo), align 8 +; CHECK: @__start___sancov_pcs = extern_weak hidden global i64 +; CHECK-NEXT: @__stop___sancov_pcs = extern_weak hidden global i64 ; CHECK: define internal void @sancov.module_ctor ; CHECK: call void @__sanitizer_cov ; CHECK: call void @__sanitizer_cov_pcs_init -- GitLab From aa8d33a6d6346e1ed444a59d0655f4a43ba96875 Mon Sep 17 00:00:00 2001 From: Hsiangkai Wang Date: Mon, 15 Mar 2021 13:58:11 +0800 Subject: [PATCH 0100/1000] [RISCV] Spilling for Zvlsseg registers. For Zvlsseg, we create several tuple register classes. When spilling or reloading a register from one of these tuple classes, we need to iterate NF times, storing or loading one member of the tuple on each iteration.
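As an illustration of that iteration, a spill of an NF=2, LMUL=1 tuple becomes one whole-register store per member, with the base address advanced by the scaled vector length in between (a minimal sketch; the register assignments are illustrative, not fixed by the pseudo instruction):

```
# Hypothetical expansion of PseudoVSPILL2_M1 (NF = 2, LMUL = 1).
csrr   a1, vlenb    # scaled vector length: bytes covered by one vector register
vs1r.v v0, (a0)     # store the first member of the tuple
add    a0, a0, a1   # step the base address to the next member's slot
vs1r.v v1, (a0)     # store the second member of the tuple
```

Reloads mirror this with whole-register loads (`vl1r.v`), and the LMUL=2/4 tuple classes use `vs2r.v`/`vs4r.v` instead.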
Differential Revision: https://reviews.llvm.org/D98629 --- .../Target/RISCV/RISCVExpandPseudoInsts.cpp | 116 +++++++ llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 165 ++++++++-- llvm/lib/Target/RISCV/RISCVInstrInfo.h | 3 + .../Target/RISCV/RISCVInstrInfoVPseudos.td | 14 + llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 13 +- .../CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll | 299 ++++++++++++++++++ .../CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll | 299 ++++++++++++++++++ 7 files changed, 879 insertions(+), 30 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll create mode 100644 llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index ec9a39569952..581f26c64abc 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -62,6 +62,8 @@ private: bool expandVSetVL(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool expandVMSET_VMCLR(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned Opcode); + bool expandVSPILL(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); + bool expandVRELOAD(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); }; char RISCVExpandPseudo::ID = 0; @@ -123,6 +125,30 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB, case RISCV::PseudoVMSET_M_B64: // vmset.m vd => vmxnor.mm vd, vd, vd return expandVMSET_VMCLR(MBB, MBBI, RISCV::VMXNOR_MM); + case RISCV::PseudoVSPILL2_M1: + case RISCV::PseudoVSPILL2_M2: + case RISCV::PseudoVSPILL2_M4: + case RISCV::PseudoVSPILL3_M1: + case RISCV::PseudoVSPILL3_M2: + case RISCV::PseudoVSPILL4_M1: + case RISCV::PseudoVSPILL4_M2: + case RISCV::PseudoVSPILL5_M1: + case RISCV::PseudoVSPILL6_M1: + case RISCV::PseudoVSPILL7_M1: + case RISCV::PseudoVSPILL8_M1: + return expandVSPILL(MBB, MBBI); + case RISCV::PseudoVRELOAD2_M1: + case RISCV::PseudoVRELOAD2_M2: + case RISCV::PseudoVRELOAD2_M4: + case RISCV::PseudoVRELOAD3_M1: + case RISCV::PseudoVRELOAD3_M2: + case RISCV::PseudoVRELOAD4_M1: + case RISCV::PseudoVRELOAD4_M2: + case RISCV::PseudoVRELOAD5_M1: + case RISCV::PseudoVRELOAD6_M1: + case RISCV::PseudoVRELOAD7_M1: + case RISCV::PseudoVRELOAD8_M1: + return expandVRELOAD(MBB, MBBI); } return false; @@ -253,6 +279,96 @@ bool RISCVExpandPseudo::expandVMSET_VMCLR(MachineBasicBlock &MBB, return true; } +bool RISCVExpandPseudo::expandVSPILL(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + DebugLoc DL = MBBI->getDebugLoc(); + Register SrcReg = MBBI->getOperand(0).getReg(); + Register Base = MBBI->getOperand(1).getReg(); + Register VL = MBBI->getOperand(2).getReg(); + auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MBBI->getOpcode()); + if (!ZvlssegInfo) + return false; + unsigned NF = ZvlssegInfo->first; + unsigned LMUL = ZvlssegInfo->second; + assert(NF * LMUL <= 8 && "Invalid NF/LMUL combinations."); + unsigned Opcode = RISCV::VS1R_V; + unsigned SubRegIdx = RISCV::sub_vrm1_0; + static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7, + "Unexpected subreg numbering"); + if (LMUL == 2) { + Opcode = RISCV::VS2R_V; + SubRegIdx = RISCV::sub_vrm2_0; + static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3, + "Unexpected subreg numbering"); + } else if (LMUL == 4) { + Opcode = RISCV::VS4R_V; + SubRegIdx = RISCV::sub_vrm4_0; + static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1, + "Unexpected subreg numbering"); + } else + 
assert(LMUL == 1 && "LMUL must be 1, 2, or 4."); + + for (unsigned I = 0; I < NF; ++I) { + BuildMI(MBB, MBBI, DL, TII->get(Opcode)) + .addReg(TRI->getSubReg(SrcReg, SubRegIdx + I)) + .addReg(Base) + .addMemOperand(*(MBBI->memoperands_begin())); + if (I != NF - 1) + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADD), Base) + .addReg(Base) + .addReg(VL); + } + MBBI->eraseFromParent(); + return true; +} + +bool RISCVExpandPseudo::expandVRELOAD(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + const TargetRegisterInfo *TRI = + MBB.getParent()->getSubtarget().getRegisterInfo(); + DebugLoc DL = MBBI->getDebugLoc(); + Register DestReg = MBBI->getOperand(0).getReg(); + Register Base = MBBI->getOperand(1).getReg(); + Register VL = MBBI->getOperand(2).getReg(); + auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MBBI->getOpcode()); + if (!ZvlssegInfo) + return false; + unsigned NF = ZvlssegInfo->first; + unsigned LMUL = ZvlssegInfo->second; + assert(NF * LMUL <= 8 && "Invalid NF/LMUL combinations."); + unsigned Opcode = RISCV::VL1RE8_V; + unsigned SubRegIdx = RISCV::sub_vrm1_0; + static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7, + "Unexpected subreg numbering"); + if (LMUL == 2) { + Opcode = RISCV::VL2RE8_V; + SubRegIdx = RISCV::sub_vrm2_0; + static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3, + "Unexpected subreg numbering"); + } else if (LMUL == 4) { + Opcode = RISCV::VL4RE8_V; + SubRegIdx = RISCV::sub_vrm4_0; + static_assert(RISCV::sub_vrm4_1 == RISCV::sub_vrm4_0 + 1, + "Unexpected subreg numbering"); + } else + assert(LMUL == 1 && "LMUL must be 1, 2, or 4."); + + for (unsigned I = 0; I < NF; ++I) { + BuildMI(MBB, MBBI, DL, TII->get(Opcode), + TRI->getSubReg(DestReg, SubRegIdx + I)) + .addReg(Base) + .addMemOperand(*(MBBI->memoperands_begin())); + if (I != NF - 1) + BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADD), Base) + .addReg(Base) + .addReg(VL); + } + MBBI->eraseFromParent(); + return true; +} + } // end of anonymous namespace INITIALIZE_PASS(RISCVExpandPseudo, "riscv-expand-pseudo", diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index a2ce3597be8f..7d205d76b55c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -167,29 +167,56 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &MFI = MF->getFrameInfo(); unsigned Opcode; - bool IsScalableVector = false; - if (RISCV::GPRRegClass.hasSubClassEq(RC)) + bool IsScalableVector = true; + bool IsZvlsseg = true; + if (RISCV::GPRRegClass.hasSubClassEq(RC)) { Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? 
RISCV::SW : RISCV::SD; - else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) + IsScalableVector = false; + } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FSH; - else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) + IsScalableVector = false; + } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FSW; - else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) + IsScalableVector = false; + } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FSD; - else if (RISCV::VRRegClass.hasSubClassEq(RC)) { + IsScalableVector = false; + } else if (RISCV::VRRegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoVSPILL_M1; - IsScalableVector = true; + IsZvlsseg = false; } else if (RISCV::VRM2RegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoVSPILL_M2; - IsScalableVector = true; + IsZvlsseg = false; } else if (RISCV::VRM4RegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoVSPILL_M4; - IsScalableVector = true; + IsZvlsseg = false; } else if (RISCV::VRM8RegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoVSPILL_M8; - IsScalableVector = true; - } else + IsZvlsseg = false; + } else if (RISCV::VRN2M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL2_M1; + else if (RISCV::VRN2M2RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL2_M2; + else if (RISCV::VRN2M4RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL2_M4; + else if (RISCV::VRN3M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL3_M1; + else if (RISCV::VRN3M2RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL3_M2; + else if (RISCV::VRN4M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL4_M1; + else if (RISCV::VRN4M2RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL4_M2; + else if (RISCV::VRN5M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL5_M1; + else if (RISCV::VRN6M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL6_M1; + else if (RISCV::VRN7M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL7_M1; + else if (RISCV::VRN8M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVSPILL8_M1; + else llvm_unreachable("Can't store this register to stack slot"); if (IsScalableVector) { @@ -198,10 +225,16 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MemoryLocation::UnknownSize, MFI.getObjectAlign(FI)); MFI.setStackID(FI, TargetStackID::ScalableVector); - BuildMI(MBB, I, DL, get(Opcode)) - .addReg(SrcReg, getKillRegState(IsKill)) - .addFrameIndex(FI) - .addMemOperand(MMO); + auto MIB = BuildMI(MBB, I, DL, get(Opcode)) + .addReg(SrcReg, getKillRegState(IsKill)) + .addFrameIndex(FI) + .addMemOperand(MMO); + if (IsZvlsseg) { + // For spilling/reloading Zvlsseg registers, append the dummy field for + // the scaled vector length. The argument will be used when expanding + // these pseudo instructions. + MIB.addReg(RISCV::X0); + } } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, @@ -228,29 +261,56 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &MFI = MF->getFrameInfo(); unsigned Opcode; - bool IsScalableVector = false; - if (RISCV::GPRRegClass.hasSubClassEq(RC)) + bool IsScalableVector = true; + bool IsZvlsseg = true; + if (RISCV::GPRRegClass.hasSubClassEq(RC)) { Opcode = TRI->getRegSizeInBits(RISCV::GPRRegClass) == 32 ? 
RISCV::LW : RISCV::LD; - else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) + IsScalableVector = false; + } else if (RISCV::FPR16RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FLH; - else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) + IsScalableVector = false; + } else if (RISCV::FPR32RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FLW; - else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) + IsScalableVector = false; + } else if (RISCV::FPR64RegClass.hasSubClassEq(RC)) { Opcode = RISCV::FLD; - else if (RISCV::VRRegClass.hasSubClassEq(RC)) { + IsScalableVector = false; + } else if (RISCV::VRRegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoVRELOAD_M1; - IsScalableVector = true; + IsZvlsseg = false; } else if (RISCV::VRM2RegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoVRELOAD_M2; - IsScalableVector = true; + IsZvlsseg = false; } else if (RISCV::VRM4RegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoVRELOAD_M4; - IsScalableVector = true; + IsZvlsseg = false; } else if (RISCV::VRM8RegClass.hasSubClassEq(RC)) { Opcode = RISCV::PseudoVRELOAD_M8; - IsScalableVector = true; - } else + IsZvlsseg = false; + } else if (RISCV::VRN2M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD2_M1; + else if (RISCV::VRN2M2RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD2_M2; + else if (RISCV::VRN2M4RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD2_M4; + else if (RISCV::VRN3M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD3_M1; + else if (RISCV::VRN3M2RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD3_M2; + else if (RISCV::VRN4M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD4_M1; + else if (RISCV::VRN4M2RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD4_M2; + else if (RISCV::VRN5M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD5_M1; + else if (RISCV::VRN6M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD6_M1; + else if (RISCV::VRN7M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD7_M1; + else if (RISCV::VRN8M1RegClass.hasSubClassEq(RC)) + Opcode = RISCV::PseudoVRELOAD8_M1; + else llvm_unreachable("Can't load this register from stack slot"); if (IsScalableVector) { @@ -259,9 +319,15 @@ void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MemoryLocation::UnknownSize, MFI.getObjectAlign(FI)); MFI.setStackID(FI, TargetStackID::ScalableVector); - BuildMI(MBB, I, DL, get(Opcode), DstReg) - .addFrameIndex(FI) - .addMemOperand(MMO); + auto MIB = BuildMI(MBB, I, DL, get(Opcode), DstReg) + .addFrameIndex(FI) + .addMemOperand(MMO); + if (IsZvlsseg) { + // For spilling/reloading Zvlsseg registers, append the dummy field for + // the scaled vector length. The argument will be used when expanding + // these pseudo instructions. 
+ MIB.addReg(RISCV::X0); + } } else { MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, @@ -1217,3 +1283,44 @@ Register RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF, return FactorRegister; } + +Optional> +RISCVInstrInfo::isRVVSpillForZvlsseg(unsigned Opcode) const { + switch (Opcode) { + default: + return None; + case RISCV::PseudoVSPILL2_M1: + case RISCV::PseudoVRELOAD2_M1: + return std::make_pair(2u, 1u); + case RISCV::PseudoVSPILL2_M2: + case RISCV::PseudoVRELOAD2_M2: + return std::make_pair(2u, 2u); + case RISCV::PseudoVSPILL2_M4: + case RISCV::PseudoVRELOAD2_M4: + return std::make_pair(2u, 4u); + case RISCV::PseudoVSPILL3_M1: + case RISCV::PseudoVRELOAD3_M1: + return std::make_pair(3u, 1u); + case RISCV::PseudoVSPILL3_M2: + case RISCV::PseudoVRELOAD3_M2: + return std::make_pair(3u, 2u); + case RISCV::PseudoVSPILL4_M1: + case RISCV::PseudoVRELOAD4_M1: + return std::make_pair(4u, 1u); + case RISCV::PseudoVSPILL4_M2: + case RISCV::PseudoVRELOAD4_M2: + return std::make_pair(4u, 2u); + case RISCV::PseudoVSPILL5_M1: + case RISCV::PseudoVRELOAD5_M1: + return std::make_pair(5u, 1u); + case RISCV::PseudoVSPILL6_M1: + case RISCV::PseudoVRELOAD6_M1: + return std::make_pair(6u, 1u); + case RISCV::PseudoVSPILL7_M1: + case RISCV::PseudoVRELOAD7_M1: + return std::make_pair(7u, 1u); + case RISCV::PseudoVSPILL8_M1: + case RISCV::PseudoVRELOAD8_M1: + return std::make_pair(8u, 1u); + } +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index f15d61ede037..ae03d121f42d 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -147,6 +147,9 @@ public: MachineBasicBlock::iterator II, int64_t Amount) const; + Optional> + isRVVSpillForZvlsseg(unsigned Opcode) const; + protected: const RISCVSubtarget &STI; }; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 006703e97f6d..583b6393581f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -3171,6 +3171,20 @@ let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in { def PseudoVRELOAD_M8 : VPseudo; } +foreach lmul = MxList.m in { + foreach nf = NFSet.L in { + defvar vreg = SegRegClass.RC; + let hasSideEffects = 0, mayLoad = 0, mayStore = 1, isCodeGenOnly = 1 in { + def "PseudoVSPILL" # nf # "_" # lmul.MX : + Pseudo<(outs), (ins vreg:$rs1, GPR:$rs2, GPR:$vlenb), []>; + } + let hasSideEffects = 0, mayLoad = 1, mayStore = 0, isCodeGenOnly = 1 in { + def "PseudoVRELOAD" # nf # "_" # lmul.MX : + Pseudo<(outs vreg:$rs1), (ins GPR:$rs2, GPR:$vlenb), []>; + } + } +} + //===----------------------------------------------------------------------===// // 6. 
Configuration-Setting Instructions //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index e1cd29c49158..ad6d3af21d58 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -195,7 +195,8 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, StackOffset Offset = getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg); bool isRVV = RISCVVPseudosTable::getPseudoInfo(MI.getOpcode()) || - isRVVWholeLoadStore(MI.getOpcode()); + isRVVWholeLoadStore(MI.getOpcode()) || + TII->isRVVSpillForZvlsseg(MI.getOpcode()); if (!isRVV) Offset += StackOffset::getFixed(MI.getOperand(FIOperandNum + 1).getImm()); @@ -268,6 +269,16 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, if (!isRVV) MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset.getFixed()); } + + MachineFrameInfo &MFI = MF.getFrameInfo(); + auto ZvlssegInfo = TII->isRVVSpillForZvlsseg(MI.getOpcode()); + if (ZvlssegInfo) { + int64_t ScalableValue = MFI.getObjectSize(FrameIndex) / ZvlssegInfo->first; + Register FactorRegister = + TII->getVLENFactoredAmount(MF, MBB, II, ScalableValue); + MI.getOperand(FIOperandNum + 1) + .ChangeToRegister(FactorRegister, /*isDef=*/false); + } } Register RISCVRegisterInfo::getFrameRegister(const MachineFunction &MF) const { diff --git a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll new file mode 100644 index 000000000000..d549c03d9d02 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll @@ -0,0 +1,299 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -mattr=+m -O0 < %s \ +; RUN: | FileCheck --check-prefix=SPILL-O0 %s +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -mattr=+m -O2 < %s \ +; RUN: | FileCheck --check-prefix=SPILL-O2 %s + +define @spill_zvlsseg_nxv1i32(i32* %base, i32 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg_nxv1i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,mf2,ta,mu +; SPILL-O0-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O0-NEXT: vmv1r.v v25, v1 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs1r.v v25, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg_nxv1i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: slli a2, a2, 1 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,mf2,ta,mu +; SPILL-O2-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: vl1r.v v7, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl1r.v v8, (a0) # 
Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8 killed $v8 killed $v7_v8 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,} @llvm.riscv.vlseg2.nxv1i32(i32* %base, i32 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,} %0, 1 + ret %1 +} + +define @spill_zvlsseg_nxv2i32(i32* %base, i32 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg_nxv2i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; SPILL-O0-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O0-NEXT: vmv1r.v v25, v1 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs1r.v v25, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg_nxv2i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: slli a2, a2, 1 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; SPILL-O2-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: vl1r.v v7, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8 killed $v8 killed $v7_v8 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,} @llvm.riscv.vlseg2.nxv2i32(i32* %base, i32 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,} %0, 1 + ret %1 +} + +define @spill_zvlsseg_nxv4i32(i32* %base, i32 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg_nxv4i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: slli a2, a2, 1 +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; SPILL-O0-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O0-NEXT: vmv2r.v v26, v2 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs2r.v v26, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl2re8.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: slli a0, a0, 1 +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg_nxv4i32: +; 
SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: slli a2, a2, 2 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; SPILL-O2-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 1 +; SPILL-O2-NEXT: vs2r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 1 +; SPILL-O2-NEXT: vl2r.v v6, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8m2 killed $v8m2 killed $v6m2_v8m2 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 2 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,} @llvm.riscv.vlseg2.nxv4i32(i32* %base, i32 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,} %0, 1 + ret %1 +} + +define @spill_zvlsseg_nxv8i32(i32* %base, i32 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg_nxv8i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: slli a2, a2, 2 +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,m4,ta,mu +; SPILL-O0-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O0-NEXT: vmv4r.v v28, v4 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs4r.v v28, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl4re8.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: slli a0, a0, 2 +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg_nxv8i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: slli a2, a2, 3 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,m4,ta,mu +; SPILL-O2-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 2 +; SPILL-O2-NEXT: vs4r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 2 +; SPILL-O2-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8m4 killed $v8m4 killed $v4m4_v8m4 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 3 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,} @llvm.riscv.vlseg2.nxv8i32(i32* %base, i32 %vl) + call void asm sideeffect "", + 
"~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,} %0, 1 + ret %1 +} + +define @spill_zvlsseg3_nxv4i32(i32* %base, i32 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg3_nxv4i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: slli a2, a2, 1 +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; SPILL-O0-NEXT: vlseg3e32.v v0, (a0) +; SPILL-O0-NEXT: vmv2r.v v26, v2 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs2r.v v26, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl2re8.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: slli a0, a0, 1 +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg3_nxv4i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: addi a3, zero, 6 +; SPILL-O2-NEXT: mul a2, a2, a3 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; SPILL-O2-NEXT: vlseg3e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 1 +; SPILL-O2-NEXT: vs2r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs2r.v v4, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 1 +; SPILL-O2-NEXT: vl2r.v v6, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8m2 killed $v8m2 killed $v6m2_v8m2_v10m2 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: addi a1, zero, 6 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,,} @llvm.riscv.vlseg3.nxv4i32(i32* %base, i32 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,,} %0, 1 + ret %1 +} + +declare {,} @llvm.riscv.vlseg2.nxv1i32(i32* , i32) +declare {,} @llvm.riscv.vlseg2.nxv2i32(i32* , i32) +declare {,} @llvm.riscv.vlseg2.nxv4i32(i32* , i32) +declare {,} @llvm.riscv.vlseg2.nxv8i32(i32* , i32) +declare {,,} @llvm.riscv.vlseg3.nxv4i32(i32* , i32) diff --git a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll new file mode 100644 index 000000000000..bbda9980380b --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll @@ -0,0 +1,299 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -mattr=+m -O0 < %s \ +; RUN: | FileCheck --check-prefix=SPILL-O0 %s +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v 
-mattr=+m -O2 < %s \ +; RUN: | FileCheck --check-prefix=SPILL-O2 %s + +define @spill_zvlsseg_nxv1i32(i32* %base, i64 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg_nxv1i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,mf2,ta,mu +; SPILL-O0-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O0-NEXT: vmv1r.v v25, v1 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs1r.v v25, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg_nxv1i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: slli a2, a2, 1 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,mf2,ta,mu +; SPILL-O2-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: vl1r.v v7, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8 killed $v8 killed $v7_v8 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,} @llvm.riscv.vlseg2.nxv1i32(i32* %base, i64 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,} %0, 1 + ret %1 +} + +define @spill_zvlsseg_nxv2i32(i32* %base, i64 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg_nxv2i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; SPILL-O0-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O0-NEXT: vmv1r.v v25, v1 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs1r.v v25, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg_nxv2i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: slli a2, a2, 1 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,m1,ta,mu +; SPILL-O2-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs1r.v v1, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: 
vl1r.v v7, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8 killed $v8 killed $v7_v8 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,} @llvm.riscv.vlseg2.nxv2i32(i32* %base, i64 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,} %0, 1 + ret %1 +} + +define @spill_zvlsseg_nxv4i32(i32* %base, i64 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg_nxv4i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: slli a2, a2, 1 +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; SPILL-O0-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O0-NEXT: vmv2r.v v26, v2 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs2r.v v26, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl2re8.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: slli a0, a0, 1 +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg_nxv4i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: slli a2, a2, 2 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; SPILL-O2-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 1 +; SPILL-O2-NEXT: vs2r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 1 +; SPILL-O2-NEXT: vl2r.v v6, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8m2 killed $v8m2 killed $v6m2_v8m2 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 2 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,} @llvm.riscv.vlseg2.nxv4i32(i32* %base, i64 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,} %0, 1 + ret %1 +} + +define @spill_zvlsseg_nxv8i32(i32* %base, i64 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg_nxv8i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: slli a2, a2, 2 +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,m4,ta,mu +; SPILL-O0-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O0-NEXT: vmv4r.v v28, v4 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs4r.v v28, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; 
SPILL-O0-NEXT: vl4re8.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: slli a0, a0, 2 +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg_nxv8i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: slli a2, a2, 3 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,m4,ta,mu +; SPILL-O2-NEXT: vlseg2e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 2 +; SPILL-O2-NEXT: vs4r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs4r.v v4, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 2 +; SPILL-O2-NEXT: vl4r.v v4, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl4r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: # kill: def $v8m4 killed $v8m4 killed $v4m4_v8m4 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: slli a0, a0, 3 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,} @llvm.riscv.vlseg2.nxv8i32(i32* %base, i64 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,} %0, 1 + ret %1 +} + +define @spill_zvlsseg3_nxv4i32(i32* %base, i64 %vl) nounwind { +; SPILL-O0-LABEL: spill_zvlsseg3_nxv4i32: +; SPILL-O0: # %bb.0: # %entry +; SPILL-O0-NEXT: addi sp, sp, -16 +; SPILL-O0-NEXT: csrr a2, vlenb +; SPILL-O0-NEXT: slli a2, a2, 1 +; SPILL-O0-NEXT: sub sp, sp, a2 +; SPILL-O0-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; SPILL-O0-NEXT: vlseg3e32.v v0, (a0) +; SPILL-O0-NEXT: vmv2r.v v26, v2 +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vs2r.v v26, (a0) # Unknown-size Folded Spill +; SPILL-O0-NEXT: #APP +; SPILL-O0-NEXT: #NO_APP +; SPILL-O0-NEXT: addi a0, sp, 16 +; SPILL-O0-NEXT: vl2re8.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O0-NEXT: csrr a0, vlenb +; SPILL-O0-NEXT: slli a0, a0, 1 +; SPILL-O0-NEXT: add sp, sp, a0 +; SPILL-O0-NEXT: addi sp, sp, 16 +; SPILL-O0-NEXT: ret +; +; SPILL-O2-LABEL: spill_zvlsseg3_nxv4i32: +; SPILL-O2: # %bb.0: # %entry +; SPILL-O2-NEXT: addi sp, sp, -16 +; SPILL-O2-NEXT: csrr a2, vlenb +; SPILL-O2-NEXT: addi a3, zero, 6 +; SPILL-O2-NEXT: mul a2, a2, a3 +; SPILL-O2-NEXT: sub sp, sp, a2 +; SPILL-O2-NEXT: vsetvli a1, a1, e32,m2,ta,mu +; SPILL-O2-NEXT: vlseg3e32.v v0, (a0) +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 1 +; SPILL-O2-NEXT: vs2r.v v0, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs2r.v v2, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vs2r.v v4, (a0) # Unknown-size Folded Spill +; SPILL-O2-NEXT: #APP +; SPILL-O2-NEXT: #NO_APP +; SPILL-O2-NEXT: addi a0, sp, 16 +; SPILL-O2-NEXT: csrr a1, vlenb +; SPILL-O2-NEXT: slli a1, a1, 1 +; SPILL-O2-NEXT: vl2r.v v6, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl2r.v v8, (a0) # Unknown-size Folded Reload +; SPILL-O2-NEXT: add a0, a0, a1 +; SPILL-O2-NEXT: vl2r.v v10, (a0) # Unknown-size Folded Reload 
+; SPILL-O2-NEXT: # kill: def $v8m2 killed $v8m2 killed $v6m2_v8m2_v10m2 +; SPILL-O2-NEXT: csrr a0, vlenb +; SPILL-O2-NEXT: addi a1, zero, 6 +; SPILL-O2-NEXT: mul a0, a0, a1 +; SPILL-O2-NEXT: add sp, sp, a0 +; SPILL-O2-NEXT: addi sp, sp, 16 +; SPILL-O2-NEXT: ret +entry: + %0 = tail call {,,} @llvm.riscv.vlseg3.nxv4i32(i32* %base, i64 %vl) + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + %1 = extractvalue {,,} %0, 1 + ret %1 +} + +declare {,} @llvm.riscv.vlseg2.nxv1i32(i32* , i64) +declare {,} @llvm.riscv.vlseg2.nxv2i32(i32* , i64) +declare {,} @llvm.riscv.vlseg2.nxv4i32(i32* , i64) +declare {,} @llvm.riscv.vlseg2.nxv8i32(i32* , i64) +declare {,,} @llvm.riscv.vlseg3.nxv4i32(i32* , i64) -- GitLab From cbab2cd6bf77f121c0d8a46abf607895b2911a20 Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Thu, 18 Mar 2021 17:13:50 -0700 Subject: [PATCH 0101/1000] [WebAssembly] Remove experimental instructions from wasm_simd128.h These experimental builtin functions and the feature macro they were gated behind have been removed. Reviewed By: aheejin Differential Revision: https://reviews.llvm.org/D98907 --- clang/lib/Headers/wasm_simd128.h | 48 -------------------------------- 1 file changed, 48 deletions(-) diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h index 20f5a85b3224..eb2a42f303b6 100644 --- a/clang/lib/Headers/wasm_simd128.h +++ b/clang/lib/Headers/wasm_simd128.h @@ -825,18 +825,6 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_neg(v128_t __a) { return (v128_t)(-(__u64x2)__a); } -#ifdef __wasm_unimplemented_simd128__ - -static __inline__ bool __DEFAULT_FN_ATTRS wasm_i64x2_any_true(v128_t __a) { - return __builtin_wasm_any_true_i64x2((__i64x2)__a); -} - -static __inline__ bool __DEFAULT_FN_ATTRS wasm_i64x2_all_true(v128_t __a) { - return __builtin_wasm_all_true_i64x2((__i64x2)__a); -} - -#endif // __wasm_unimplemented_simd128__ - static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_i64x2_shl(v128_t __a, int32_t __b) { return (v128_t)((__i64x2)__a << (int64_t)__b); @@ -879,24 +867,6 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_sqrt(v128_t __a) { return (v128_t)__builtin_wasm_sqrt_f32x4((__f32x4)__a); } -#ifdef __wasm_unimplemented_simd128__ - -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_qfma(v128_t __a, - v128_t __b, - v128_t __c) { - return (v128_t)__builtin_wasm_qfma_f32x4((__f32x4)__a, (__f32x4)__b, - (__f32x4)__c); -} - -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_qfms(v128_t __a, - v128_t __b, - v128_t __c) { - return (v128_t)__builtin_wasm_qfms_f32x4((__f32x4)__a, (__f32x4)__b, - (__f32x4)__c); -} - -#endif // __wasm_unimplemented_simd128__ - static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f32x4_add(v128_t __a, v128_t __b) { return (v128_t)((__f32x4)__a + (__f32x4)__b); @@ -949,24 +919,6 @@ static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_sqrt(v128_t __a) { return (v128_t)__builtin_wasm_sqrt_f64x2((__f64x2)__a); } -#ifdef __wasm_unimplemented_simd128__ - -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_qfma(v128_t __a, - v128_t __b, - v128_t __c) { - return (v128_t)__builtin_wasm_qfma_f64x2((__f64x2)__a, (__f64x2)__b, - (__f64x2)__c); -} - -static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_qfms(v128_t __a, - v128_t __b, - v128_t __c) { - return (v128_t)__builtin_wasm_qfms_f64x2((__f64x2)__a, 
(__f64x2)__b, - (__f64x2)__c); -} - -#endif // __wasm_unimplemented_simd128__ - static __inline__ v128_t __DEFAULT_FN_ATTRS wasm_f64x2_add(v128_t __a, v128_t __b) { return (v128_t)((__f64x2)__a + (__f64x2)__b); -- GitLab From fa26da0582a4d5d922379db1d9fae87416b650d6 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 18 Mar 2021 17:33:12 -0700 Subject: [PATCH 0102/1000] Add a couple of missing attribute query methods [NFC] --- llvm/include/llvm/IR/Argument.h | 3 +++ llvm/include/llvm/IR/Function.h | 8 ++++++++ llvm/lib/IR/Function.cpp | 5 +++++ 3 files changed, 16 insertions(+)
diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h index 76d780485ea0..e8ca8a6e81b9 100644 --- a/llvm/include/llvm/IR/Argument.h +++ b/llvm/include/llvm/IR/Argument.h @@ -120,6 +120,9 @@ public: /// Return true if this argument has the nocapture attribute. bool hasNoCaptureAttr() const; + /// Return true if this argument has the nofree attribute. + bool hasNoFreeAttr() const; + /// Return true if this argument has the sret attribute. bool hasStructRetAttr() const;
diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index b1ef3b113190..b3a1b6c03618 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -624,6 +624,14 @@ public: addFnAttr(Attribute::NoFree); } + /// Determine if the function is known not to synchronize with other threads. + bool hasNoSync() const { + return hasFnAttribute(Attribute::NoSync); + } + void setNoSync() { + addFnAttr(Attribute::NoSync); + } + /// Determine if the function is known not to recurse, directly or /// indirectly. bool doesNotRecurse() const {
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 553db4e8f3f1..46aec7294572 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -240,6 +240,11 @@ bool Argument::hasNoCaptureAttr() const { return hasAttribute(Attribute::NoCapture); } +bool Argument::hasNoFreeAttr() const { + if (!getType()->isPointerTy()) return false; + return hasAttribute(Attribute::NoFree); +} + bool Argument::hasStructRetAttr() const { if (!getType()->isPointerTy()) return false; return hasAttribute(Attribute::StructRet); -- GitLab From 71c4da83b67a485f0cfacbce8b46eaa497df900e Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Thu, 18 Mar 2021 17:44:17 -0700 Subject: [PATCH 0103/1000] Don't assume that stepping out of a function will land on the next line.
For instance, some recent clang emits this code on x86_64: 0x100002b99 <+57>: callq 0x100002b40 ; step_out_of_here at main.cpp:11 -> 0x100002b9e <+62>: xorl %eax, %eax 0x100002ba0 <+64>: popq %rbp 0x100002ba1 <+65>: retq and the "xorl %eax, %eax" is attributed to the same line as the callq. Since step out is supposed to stop just on returning from the function, you can't guarantee it will end up on the next line. I changed the test to check that we were either on the call line or on the next line, since either would be right depending on the debug information.
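In test terms, the relaxed expectation is roughly the following sketch (the
names come from the check_stepping_thread helper added in the diff below):

    line = self.step_out_thread.frames[0].line_entry.line
    self.assertTrue(line == self.step_out_dest or line == self.step_in_line)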
--- .../thread/step_out/TestThreadStepOut.py | 78 ++++++++++--------- .../functionalities/thread/step_out/main.cpp | 2 +- 2 files changed, 41 insertions(+), 39 deletions(-) diff --git a/lldb/test/API/functionalities/thread/step_out/TestThreadStepOut.py b/lldb/test/API/functionalities/thread/step_out/TestThreadStepOut.py index 2ab36b57eaee..5b34e74b410d 100644 --- a/lldb/test/API/functionalities/thread/step_out/TestThreadStepOut.py +++ b/lldb/test/API/functionalities/thread/step_out/TestThreadStepOut.py @@ -62,50 +62,58 @@ class ThreadStepOutTestCase(TestBase): """Test thread step out on one thread via Python API (dwarf).""" self.build(dictionary=self.getBuildFlags()) self.step_out_test(self.step_out_with_python) - + def setUp(self): # Call super's setUp(). TestBase.setUp(self) # Find the line number for our breakpoint. self.bkpt_string = '// Set breakpoint here' self.breakpoint = line_number('main.cpp', self.bkpt_string) - - self.step_out_destination = line_number( - 'main.cpp', '// Expect to stop here after step-out.') - + self.step_in_line = line_number('main.cpp', '// But we might still be here') + self.step_out_dest = line_number('main.cpp', '// Expect to stop here after step-out.') + + def check_stepping_thread(self): + zeroth_frame = self.step_out_thread.frames[0] + line_entry = zeroth_frame.line_entry + self.assertTrue(line_entry.IsValid(), "Stopped at a valid line entry") + self.assertEqual("main.cpp", line_entry.file.basename, "Still in main.cpp") + # We can't really tell whether we stay on our line + # or get to the next line, it depends on whether there are any + # instructions between the call and the return. + line = line_entry.line + self.assertTrue(line == self.step_out_dest or line == self.step_in_line, "Stepped to the wrong line: {0}".format(line)) + def step_out_single_thread_with_cmd(self): + other_threads = {} + for thread in self.process.threads: + if thread.GetIndexID() == self.step_out_thread.GetIndexID(): + continue + other_threads[thread.GetIndexID()] = thread.frames[0].line_entry + + # There should be other threads... 
+ self.assertNotEqual(len(other_threads), 0) self.step_out_with_cmd("this-thread") - self.expect( - "thread backtrace all", - "Thread location after step out is correct", - substrs=[ - "main.cpp:%d" % - self.step_out_destination, - "main.cpp:%d" % - self.breakpoint]) + # The other threads should not have made progress: + for thread in self.process.threads: + index_id = thread.GetIndexID() + line_entry = other_threads.get(index_id) + if line_entry: + self.assertEqual(thread.frames[0].line_entry.file.basename, line_entry.file.basename, "Thread {0} moved by file".format(index_id)) + self.assertEqual(thread.frames[0].line_entry.line, line_entry.line, "Thread {0} moved by line".format(index_id)) def step_out_all_threads_with_cmd(self): self.step_out_with_cmd("all-threads") - self.expect( - "thread backtrace all", - "Thread location after step out is correct", - substrs=[ - "main.cpp:%d" % - self.step_out_destination]) - + def step_out_with_cmd(self, run_mode): self.runCmd("thread select %d" % self.step_out_thread.GetIndexID()) self.runCmd("thread step-out -m %s" % run_mode) self.expect("process status", "Expected stop reason to be step-out", substrs=["stop reason = step out"]) - self.expect( - "thread list", - "Selected thread did not change during step-out", - substrs=[ - "* thread #%d" % - self.step_out_thread.GetIndexID()]) - + selected_thread = self.process.GetSelectedThread() + self.assertEqual(selected_thread.GetIndexID(), self.step_out_thread.GetIndexID(), "Step out changed selected thread.") + self.check_stepping_thread() + def step_out_with_python(self): self.step_out_thread.StepOut() @@ -115,18 +123,12 @@ class ThreadStepOutTestCase(TestBase): reason, "Expected thread stop reason 'plancomplete', but got '%s'" % lldbutil.stop_reason_to_str(reason)) - - # Verify location after stepping out - frame = self.step_out_thread.GetFrameAtIndex(0) - desc = lldbutil.get_description(frame.GetLineEntry()) - expect = "main.cpp:%d" % self.step_out_destination - self.assertTrue( - expect in desc, "Expected %s but thread stopped at %s" % - (expect, desc)) + self.check_stepping_thread() + def step_out_test(self, step_out_func): """Test single thread step out of a function.""" - (self.inferior_target, self.inferior_process, thread, bkpt) = lldbutil.run_to_source_breakpoint( + (self.inferior_target, self.process, thread, bkpt) = lldbutil.run_to_source_breakpoint( self, self.bkpt_string, lldb.SBFileSpec('main.cpp'), only_one_thread = False) # We hit the breakpoint on at least one thread. 
If we hit it on both threads @@ -135,13 +137,13 @@ class ThreadStepOutTestCase(TestBase): # the breakpoint: (breakpoint_threads, other_threads) = ([], []) - lldbutil.sort_stopped_threads(self.inferior_process, + lldbutil.sort_stopped_threads(self.process, breakpoint_threads=breakpoint_threads, other_threads=other_threads) if len(breakpoint_threads) == 1: success = thread.Suspend() self.assertTrue(success, "Couldn't suspend a thread") - bkpt_threads = lldbutil.continue_to_breakpoint(self.inferior_process, + bkpt_threads = lldbutil.continue_to_breakpoint(self.process, bkpt) self.assertEqual(len(bkpt_threads), 1, "Second thread stopped") success = thread.Resume() diff --git a/lldb/test/API/functionalities/thread/step_out/main.cpp b/lldb/test/API/functionalities/thread/step_out/main.cpp index e7dd230d239c..824f1b6c912d 100644 --- a/lldb/test/API/functionalities/thread/step_out/main.cpp +++ b/lldb/test/API/functionalities/thread/step_out/main.cpp @@ -19,7 +19,7 @@ thread_func () pseudo_barrier_wait(g_barrier); // Do something - step_out_of_here(); + step_out_of_here(); // But we might still be here // Return return NULL; // Expect to stop here after step-out. -- GitLab From 5c689e4bb0473e08645547ddbf9874b5e2fa04d0 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Thu, 18 Mar 2021 19:58:21 -0700 Subject: [PATCH 0104/1000] Improve documentation for the [[clang::lifetimebound]] attribute. --- clang/include/clang/Basic/AttrDocs.td | 37 ++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index f73fbd08e3bf..734cf026ae87 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3032,10 +3032,39 @@ is retained by the return value of the annotated function (or, for a parameter of a constructor, in the value of the constructed object). It is only supported in C++. -This attribute provides an experimental implementation of the facility -described in the C++ committee paper `P0936R0 `_, -and is subject to change as the design of the corresponding functionality -changes. +This attribute causes warnings to be produced if a temporary object does not +live long enough. For example: + +.. code-block:: c++ + + template + const U &get_or_default(std::map &m, const T &key, + const U &default_value [[clang::lifetimebound]]); + + std::map m; + // warning: temporary "bar"s that might be bound to local reference 'val' + // will be destroyed at the end of the full-expression + const std::string &val = get_or_default(m, "foo"s, "bar"s); + +When applied to a reference parameter, the referenced object is assumed to be +retained by the return value of the function. When applied to a non-reference +parameter (for example, a pointer or a class type), all temporaries referenced +by the parameter are assumed to be retained by the return value of the +function. + +The attribute can be applied to the implicit ``this`` parameter of a member +function by writing the attribute after the function type: + +.. code-block:: c++ + + struct string_view { + // ... + const char *data() const [[clang::lifetimebound]]; + }; + +This attribute is inspired by the C++ committee paper `P0936R0 +`_, but does not affect whether temporary objects +have their lifetimes extended. }]; } -- GitLab From d8ab7ad317305d80e405ffdb4f33983f743a6ca2 Mon Sep 17 00:00:00 2001 From: Richard Smith Date: Thu, 18 Mar 2021 20:06:11 -0700 Subject: [PATCH 0105/1000] Fix example in documentation. 
--- clang/include/clang/Basic/AttrDocs.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 734cf026ae87..7f30c6300e91 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3057,8 +3057,8 @@ function by writing the attribute after the function type: .. code-block:: c++ - struct string_view { - // ... + struct string { + // The returned pointer should not outlive ``*this``. const char *data() const [[clang::lifetimebound]]; }; -- GitLab From fff1363ba0ae50da3f8f7b732c90e47e504f18a9 Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 19 Mar 2021 11:29:48 +0700 Subject: [PATCH 0106/1000] [SCEV] Add false->any implication
By the definition of the implication operator, both `false -> true` and `false -> false` hold; that is, `false` implies any predicate, whether true or false. So there is no need to analyze any further: whenever the found condition is `false`, we can immediately report that it implies the statement we are trying to prove. In practice this means we are proving something guarded by a `false` condition, so the guarded code is unreachable and we can safely prove any fact about it or perform any transform on it.
Differential Revision: https://reviews.llvm.org/D98706 Reviewed By: lebedev.ri --- llvm/lib/Analysis/ScalarEvolution.cpp | 5 +++++ .../max-backedge-taken-count-guard-info.ll | 10 +++++----- .../IndVarSimplify/2011-10-27-lftrnull.ll | 5 +---- .../Transforms/IndVarSimplify/X86/pr35406.ll | 20 +++++++------------ .../IndVarSimplify/trivial-guard.ll | 12 +++-------- 5 files changed, 21 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index ecf003319cd2..7dd05d0751f1 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -10137,6 +10137,11 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, const Value *FoundCondValue, bool Inverse, const Instruction *Context) { + // A false condition implies anything. Do not bother analyzing it further.
+ if (FoundCondValue == + ConstantInt::getBool(FoundCondValue->getContext(), Inverse)) + return true; + if (!PendingLoopPredicates.insert(FoundCondValue).second) return false; diff --git a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll index 98b4dc333c0a..28723ed97e40 100644 --- a/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll +++ b/llvm/test/Analysis/ScalarEvolution/max-backedge-taken-count-guard-info.ll @@ -531,16 +531,16 @@ define void @crash(i8* %ptr) { ; CHECK-LABEL: 'crash' ; CHECK-NEXT: Classifying expressions for: @crash ; CHECK-NEXT: %text.addr.5 = phi i8* [ %incdec.ptr112, %while.cond111 ], [ null, %while.body ] -; CHECK-NEXT: --> {null,+,-1}<%while.cond111> U: full-set S: full-set Exits: <> LoopDispositions: { %while.cond111: Computable, %while.body: Variant } +; CHECK-NEXT: --> {null,+,-1}<%while.cond111> U: full-set S: full-set Exits: <> LoopDispositions: { %while.cond111: Computable, %while.body: Variant } ; CHECK-NEXT: %incdec.ptr112 = getelementptr inbounds i8, i8* %text.addr.5, i64 -1 -; CHECK-NEXT: --> {(-1 + null),+,-1}<%while.cond111> U: full-set S: full-set Exits: <> LoopDispositions: { %while.cond111: Computable, %while.body: Variant } +; CHECK-NEXT: --> {(-1 + null),+,-1}<%while.cond111> U: full-set S: full-set Exits: <> LoopDispositions: { %while.cond111: Computable, %while.body: Variant } ; CHECK-NEXT: %lastout.2271 = phi i8* [ %incdec.ptr126, %while.body125 ], [ %ptr, %while.end117 ] -; CHECK-NEXT: --> {%ptr,+,1}<%while.body125> U: full-set S: full-set Exits: {(-2 + null),+,-1}<%while.cond111> LoopDispositions: { %while.body125: Computable } +; CHECK-NEXT: --> {%ptr,+,1}<%while.body125> U: full-set S: full-set Exits: {(-2 + null),+,-1}<%while.cond111> LoopDispositions: { %while.body125: Computable } ; CHECK-NEXT: %incdec.ptr126 = getelementptr inbounds i8, i8* %lastout.2271, i64 1 -; CHECK-NEXT: --> {(1 + %ptr),+,1}<%while.body125> U: [1,0) S: [1,0) Exits: {(-1 + null),+,-1}<%while.cond111> LoopDispositions: { %while.body125: Computable } +; CHECK-NEXT: --> {(1 + %ptr),+,1}<%while.body125> U: [1,0) S: [1,0) Exits: {(-1 + null),+,-1}<%while.cond111> LoopDispositions: { %while.body125: Computable } ; CHECK-NEXT: Determining loop execution counts for: @crash ; CHECK-NEXT: Loop %while.body125: backedge-taken count is {(-2 + (-1 * %ptr) + null),+,-1}<%while.cond111> -; CHECK-NEXT: Loop %while.body125: max backedge-taken count is -1 +; CHECK-NEXT: Loop %while.body125: max backedge-taken count is -2 ; CHECK-NEXT: Loop %while.body125: Predicated backedge-taken count is {(-2 + (-1 * %ptr) + null),+,-1}<%while.cond111> ; CHECK-NEXT: Predicates: ; CHECK: Loop %while.body125: Trip multiple is 1 diff --git a/llvm/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll b/llvm/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll index d56e985ce993..ed2b8743a79d 100644 --- a/llvm/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll +++ b/llvm/test/Transforms/IndVarSimplify/2011-10-27-lftrnull.ll @@ -28,16 +28,13 @@ define void @test() nounwind { ; CHECK-NEXT: br label [[FOR_BODY21_I:%.*]] ; CHECK: for.body21.i: ; CHECK-NEXT: [[DESTYPIXELPTR_010_I:%.*]] = phi i8* [ null, [[FOR_BODY21_LR_PH_I]] ], [ [[INCDEC_PTR_I:%.*]], [[IF_END_I126:%.*]] ] -; CHECK-NEXT: [[X_09_I:%.*]] = phi i32 [ 0, [[FOR_BODY21_LR_PH_I]] ], [ [[INC_I125:%.*]], [[IF_END_I126]] ] ; CHECK-NEXT: br i1 undef, label [[IF_END_I126]], label [[IF_ELSE_I124:%.*]] ; CHECK: if.else.i124: ; 
CHECK-NEXT: store i8 undef, i8* [[DESTYPIXELPTR_010_I]], align 1 ; CHECK-NEXT: br label [[IF_END_I126]] ; CHECK: if.end.i126: ; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, i8* [[DESTYPIXELPTR_010_I]], i32 1 -; CHECK-NEXT: [[INC_I125]] = add nuw i32 [[X_09_I]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC_I125]], undef -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY21_I]], label [[FOR_END_I129_LOOPEXIT:%.*]] +; CHECK-NEXT: br i1 true, label [[FOR_BODY21_I]], label [[FOR_END_I129_LOOPEXIT:%.*]] ; CHECK: for.end.i129.loopexit: ; CHECK-NEXT: br label [[FOR_END_I129]] ; CHECK: for.end.i129: diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll index e51ee24cd343..6d7bbced417d 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll @@ -9,8 +9,8 @@ define i32 @testDiv(i8* %p, i64* %p1) { ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop1: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP2_EXIT:%.*]] ], [ 8, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[EXITCOND3:%.*]] = icmp eq i64 [[INDVARS_IV]], 15 -; CHECK-NEXT: br i1 [[EXITCOND3]], label [[EXIT:%.*]], label [[GENERAL_CASE24:%.*]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 15 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[GENERAL_CASE24:%.*]] ; CHECK: general_case24: ; CHECK-NEXT: br i1 false, label [[LOOP2_PREHEADER:%.*]], label [[LOOP2_EXIT]] ; CHECK: loop2.preheader: @@ -19,14 +19,11 @@ define i32 @testDiv(i8* %p, i64* %p1) { ; CHECK-NEXT: br label [[LOOP2:%.*]] ; CHECK: loop2: ; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[TMP1]], [[LOOP2_PREHEADER]] ], [ [[INDVARS_IV_NEXT2:%.*]], [[LOOP2]] ] -; CHECK-NEXT: [[LOCAL_2_57:%.*]] = phi i32 [ [[I7:%.*]], [[LOOP2]] ], [ 1, [[LOOP2_PREHEADER]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nsw i64 [[INDVARS_IV1]], -1 +; CHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], -1 ; CHECK-NEXT: [[I4:%.*]] = load atomic i64, i64* [[P1:%.*]] unordered, align 8 ; CHECK-NEXT: [[I6:%.*]] = sub i64 [[I4]], [[INDVARS_IV_NEXT2]] ; CHECK-NEXT: store atomic i64 [[I6]], i64* [[P1]] unordered, align 8 -; CHECK-NEXT: [[I7]] = add nuw nsw i32 [[LOCAL_2_57]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I7]], 9 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] +; CHECK-NEXT: br i1 false, label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] ; CHECK: loop2.exit.loopexit: ; CHECK-NEXT: br label [[LOOP2_EXIT]] ; CHECK: loop2.exit: @@ -79,8 +76,8 @@ define i32 @testRem(i8* %p, i64* %p1) { ; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop1: ; CHECK-NEXT: [[LOCAL_0_:%.*]] = phi i32 [ 8, [[ENTRY:%.*]] ], [ [[I9:%.*]], [[LOOP2_EXIT:%.*]] ] -; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i32 [[LOCAL_0_]], 15 -; CHECK-NEXT: br i1 [[EXITCOND1]], label [[EXIT:%.*]], label [[GENERAL_CASE24:%.*]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LOCAL_0_]], 15 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[GENERAL_CASE24:%.*]] ; CHECK: general_case24: ; CHECK-NEXT: br i1 false, label [[LOOP2_PREHEADER:%.*]], label [[LOOP2_EXIT]] ; CHECK: loop2.preheader: @@ -93,14 +90,11 @@ define i32 @testRem(i8* %p, i64* %p1) { ; CHECK-NEXT: br label [[LOOP2:%.*]] ; CHECK: loop2: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP5]], [[LOOP2_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[LOOP2]] ] -; CHECK-NEXT: [[LOCAL_2_57:%.*]] = phi i32 [ [[I7:%.*]], [[LOOP2]] ], [ 1, 
[[LOOP2_PREHEADER]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1 ; CHECK-NEXT: [[I4:%.*]] = load atomic i64, i64* [[P1:%.*]] unordered, align 8 ; CHECK-NEXT: [[I6:%.*]] = sub i64 [[I4]], [[INDVARS_IV_NEXT]] ; CHECK-NEXT: store atomic i64 [[I6]], i64* [[P1]] unordered, align 8 -; CHECK-NEXT: [[I7]] = add nuw nsw i32 [[LOCAL_2_57]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I7]], 9 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] +; CHECK-NEXT: br i1 false, label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] ; CHECK: loop2.exit.loopexit: ; CHECK-NEXT: br label [[LOOP2_EXIT]] ; CHECK: loop2.exit:
diff --git a/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll b/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll index 7506259aa7a3..60a0c2a8526c 100644 --- a/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll +++ b/llvm/test/Transforms/IndVarSimplify/trivial-guard.ll @@ -21,11 +21,8 @@ define void @test_01(i32 %x) { ; CHECK-NEXT: [[LOOP_COND_1:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[LOOP_COND_1]], label [[LOOP_1]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: loop.2: -; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_NEXT_2:%.*]], [[GUARDED_2:%.*]] ], [ 0, [[LOOP_2_PREHEADER]] ] -; CHECK-NEXT: [[CHECK_2:%.*]] = icmp slt i32 [[IV_2]], [[X]] -; CHECK-NEXT: br i1 [[CHECK_2]], label [[GUARDED_2]], label [[FAIL_LOOPEXIT1:%.*]] +; CHECK-NEXT: br i1 true, label [[GUARDED_2:%.*]], label [[FAIL_LOOPEXIT1:%.*]] ; CHECK: guarded.2: -; CHECK-NEXT: [[IV_NEXT_2]] = add nuw i32 [[IV_2]], 1 ; CHECK-NEXT: [[LOOP_COND_2:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[LOOP_COND_2]], label [[LOOP_2]], label [[EXIT_LOOPEXIT2:%.*]] ; CHECK: exit.loopexit: @@ -80,16 +77,13 @@ define void @test_02(i32 %x) { ; CHECK: loop.1.preheader: ; CHECK-NEXT: br label [[LOOP_1:%.*]] ; CHECK: loop.1: -; CHECK-NEXT: [[IV_1:%.*]] = phi i32 [ [[IV_NEXT_1:%.*]], [[GUARDED_1:%.*]] ], [ 0, [[LOOP_1_PREHEADER]] ] -; CHECK-NEXT: [[CHECK_1:%.*]] = icmp slt i32 [[IV_1]], [[X:%.*]] -; CHECK-NEXT: br i1 [[CHECK_1]], label [[GUARDED_1]], label [[FAIL_LOOPEXIT:%.*]] +; CHECK-NEXT: br i1 true, label [[GUARDED_1:%.*]], label [[FAIL_LOOPEXIT:%.*]] ; CHECK: guarded.1: -; CHECK-NEXT: [[IV_NEXT_1]] = add nuw i32 [[IV_1]], 1 ; CHECK-NEXT: [[LOOP_COND_1:%.*]] = call i1 @cond() ; CHECK-NEXT: br i1 [[LOOP_COND_1]], label [[LOOP_1]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: loop.2: ; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[IV_NEXT_2:%.*]], [[GUARDED_2:%.*]] ], [ 0, [[LOOP_2_PREHEADER]] ] -; CHECK-NEXT: [[CHECK_2:%.*]] = icmp slt i32 [[IV_2]], [[X]] +; CHECK-NEXT: [[CHECK_2:%.*]] = icmp slt i32 [[IV_2]], [[X:%.*]] ; CHECK-NEXT: br i1 [[CHECK_2]], label [[GUARDED_2]], label [[FAIL_LOOPEXIT1:%.*]] ; CHECK: guarded.2: ; CHECK-NEXT: [[IV_NEXT_2]] = add nuw i32 [[IV_2]], 1 -- GitLab From 1410db70b98d26e9a354373f02d4e4c407468933 Mon Sep 17 00:00:00 2001 From: Wenlei He Date: Fri, 19 Feb 2021 22:46:30 -0800 Subject: [PATCH 0107/1000] [CSSPGO] Add attribute metadata for context profile
This change adds an attribute field to the metadata of context profiles. Currently we have an inline attribute that indicates whether the leaf frame corresponding to a context profile was inlined in the previous build. This will be used to help estimate inlining and will be taken into account when trimming contexts. Corresponding changes in llvm-profgen will follow. It will also help with tuning.
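For reference, in the text format the new field appears as function metadata
alongside !CFGChecksum. A hypothetical context profile entry (function names
and counts are purely illustrative) would look like:

    [main:3 @ _Z5funcAi]:1467299:11
     0: 6292
     1: 318
     !Attributes: 1

where "!Attributes: 1" records that the leaf _Z5funcAi was inlined into its
context in the previous build (ContextWasInlined).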
Differential Revision: https://reviews.llvm.org/D98823 --- llvm/include/llvm/ProfileData/SampleProf.h | 20 ++++- .../llvm/ProfileData/SampleProfReader.h | 5 +- llvm/lib/ProfileData/SampleProfReader.cpp | 74 +++++++++++++------ llvm/lib/ProfileData/SampleProfWriter.cpp | 13 +++- .../llvm-profdata/Inputs/cs-sample.proftext | 8 ++ .../Inputs/pseudo-probe-profile.proftext | 1 + .../llvm-profgen/inline-cs-pseudoprobe.test | 2 +- .../llvm-profgen/merge-cold-profile.test | 2 + .../llvm-profgen/noinline-cs-pseudoprobe.test | 2 +- llvm/tools/llvm-profgen/PerfReader.cpp | 3 +- llvm/tools/llvm-profgen/PerfReader.h | 3 +- llvm/tools/llvm-profgen/ProfileGenerator.cpp | 22 +++--- llvm/tools/llvm-profgen/ProfileGenerator.h | 6 +- llvm/tools/llvm-profgen/ProfiledBinary.cpp | 8 +- llvm/tools/llvm-profgen/ProfiledBinary.h | 4 +- 15 files changed, 123 insertions(+), 50 deletions(-) diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 70fdaff38504..8b590e84dd9b 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -196,6 +196,7 @@ enum class SecProfSummaryFlags : uint32_t { enum class SecFuncMetadataFlags : uint32_t { SecFlagInvalid = 0, SecFlagIsProbeBased = (1 << 0), + SecFlagHasAttribute = (1 << 1) }; // Verify section specific flag is used for the correct section. @@ -385,6 +386,13 @@ enum ContextStateMask { MergedContext = 0x8 // Profile for context merged into base profile }; +// Attribute of context associated with FunctionSamples +enum ContextAttributeMask { + ContextNone = 0x0, + ContextWasInlined = 0x1, // Leaf of context was inlined in previous build + ContextShouldBeInlined = 0x2, // Leaf of context should be inlined +}; + // Sample context for FunctionSamples. It consists of the calling context, // the function name and context state. Internally sample context is represented // using StringRef, which is also the input for constructing a `SampleContext`. @@ -396,9 +404,9 @@ enum ContextStateMask { // `_Z8funcLeafi` class SampleContext { public: - SampleContext() : State(UnknownContext) {} - SampleContext(StringRef ContextStr, - ContextStateMask CState = UnknownContext) { + SampleContext() : State(UnknownContext), Attributes(ContextNone) {} + SampleContext(StringRef ContextStr, ContextStateMask CState = UnknownContext) + : Attributes(ContextNone) { setContext(ContextStr, CState); } @@ -443,6 +451,10 @@ public: } operator StringRef() const { return FullContext; } + bool hasAttribute(ContextAttributeMask A) { return Attributes & (uint32_t)A; } + void setAttribute(ContextAttributeMask A) { Attributes |= (uint32_t)A; } + uint32_t getAllAttributes() { return Attributes; } + void setAllAttributes(uint32_t A) { Attributes = A; } bool hasState(ContextStateMask S) { return State & (uint32_t)S; } void setState(ContextStateMask S) { State |= (uint32_t)S; } void clearState(ContextStateMask S) { State &= (uint32_t)~S; } @@ -503,6 +515,8 @@ private: StringRef CallingContext; // State of the associated sample profile uint32_t State; + // Attribute of the associated sample profile + uint32_t Attributes; }; class FunctionSamples; diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h index db1ec6869724..8203a1b8fb3b 100644 --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -28,6 +28,7 @@ // offsetA1[.discriminator]: number_of_samples [fn7:num fn8:num ... ] // ... 
// !CFGChecksum: num +// !Attributes: flags // // This is a nested tree in which the indentation represents the nesting level // of the inline stack. There are no blank lines in the file. And the spacing @@ -127,6 +128,8 @@ // // a. CFG Checksum (a.k.a. function hash): // !CFGChecksum: 12345 +// b. Context attributes (see ContextAttributeMask): +// !Attributes: 1 // // // Binary format @@ -647,7 +650,7 @@ protected: std::error_code readSecHdrTableEntry(uint32_t Idx); std::error_code readSecHdrTable(); - std::error_code readFuncMetadata(); + std::error_code readFuncMetadata(bool ProfileHasAttribute); std::error_code readFuncOffsetTable(); std::error_code readFuncProfiles(); std::error_code readMD5NameTable();
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp index 697d29f6f412..200a0afb01c6 100644 --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -88,13 +88,22 @@ static bool isOffsetLegal(unsigned L) { return (L & 0xffff) == L; } /// Possible metadata: /// - CFG Checksum information: /// !CFGChecksum: 12345 +/// - Context attribute information: +/// !Attributes: 1 /// Stores the FunctionHash (a.k.a. CFG Checksum) into \p FunctionHash. -static bool parseMetadata(const StringRef &Input, uint64_t &FunctionHash) { - if (!Input.startswith("!CFGChecksum:")) - return false; +static bool parseMetadata(const StringRef &Input, uint64_t &FunctionHash, + uint32_t &Attributes) { + if (Input.startswith("!CFGChecksum:")) { + StringRef CFGInfo = Input.substr(strlen("!CFGChecksum:")).trim(); + return !CFGInfo.getAsInteger(10, FunctionHash); + } + + if (Input.startswith("!Attributes:")) { + StringRef Attrib = Input.substr(strlen("!Attributes:")).trim(); + return !Attrib.getAsInteger(10, Attributes); + } - StringRef CFGInfo = Input.substr(strlen("!CFGChecksum:")).trim(); - return !CFGInfo.getAsInteger(10, FunctionHash); + return false; } enum class LineType { @@ -119,7 +128,7 @@ static bool ParseLine(const StringRef &Input, LineType &LineTy, uint32_t &Depth, uint64_t &NumSamples, uint32_t &LineOffset, uint32_t &Discriminator, StringRef &CalleeName, DenseMap &TargetCountMap, - uint64_t &FunctionHash) { + uint64_t &FunctionHash, uint32_t &Attributes) { for (Depth = 0; Input[Depth] == ' '; Depth++) ; if (Depth == 0) @@ -127,7 +136,7 @@ if (Depth == 1 && Input[Depth] == '!') { LineTy = LineType::Metadata; - return parseMetadata(Input.substr(Depth), FunctionHash); + return parseMetadata(Input.substr(Depth), FunctionHash, Attributes); } size_t n1 = Input.find(':'); @@ -270,9 +279,11 @@ std::error_code SampleProfileReaderText::readImpl() { DenseMap TargetCountMap; uint32_t Depth, LineOffset, Discriminator; LineType LineTy; - uint64_t FunctionHash; + uint64_t FunctionHash = 0; + uint32_t Attributes = 0; if (!ParseLine(*LineIt, LineTy, Depth, NumSamples, LineOffset, - Discriminator, FName, TargetCountMap, FunctionHash)) { + Discriminator, FName, TargetCountMap, FunctionHash, + Attributes)) { reportError(LineIt.line_number(), "Expected 'NUM[.NUM]: NUM[ mangled_name:NUM]*', found " + *LineIt); @@ -312,8 +323,12 @@ } case LineType::Metadata: { FunctionSamples &FProfile = *InlineStack.back(); - FProfile.setFunctionHash(FunctionHash); - ++ProbeProfileCount; + if (FunctionHash) { + FProfile.setFunctionHash(FunctionHash); + ++ProbeProfileCount; + } + if (Attributes) +
FProfile.getContext().setAllAttributes(Attributes); SeenMetadata = true; break; } @@ -601,13 +616,16 @@ std::error_code SampleProfileReaderExtBinaryBase::readOneSection( if (std::error_code EC = readFuncOffsetTable()) return EC; break; - case SecFuncMetadata: + case SecFuncMetadata: { ProfileIsProbeBased = hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagIsProbeBased); FunctionSamples::ProfileIsProbeBased = ProfileIsProbeBased; - if (std::error_code EC = readFuncMetadata()) + bool HasAttribute = + hasSecFlag(Entry, SecFuncMetadataFlags::SecFlagHasAttribute); + if (std::error_code EC = readFuncMetadata(HasAttribute)) return EC; break; + } case SecProfileSymbolList: if (std::error_code EC = readProfileSymbolList()) return EC; @@ -941,23 +959,31 @@ std::error_code SampleProfileReaderExtBinaryBase::readNameTableSec(bool IsMD5) { return SampleProfileReaderBinary::readNameTable(); } -std::error_code SampleProfileReaderExtBinaryBase::readFuncMetadata() { - if (!ProfileIsProbeBased) - return sampleprof_error::success; +std::error_code +SampleProfileReaderExtBinaryBase::readFuncMetadata(bool ProfileHasAttribute) { while (Data < End) { auto FName(readStringFromTable()); if (std::error_code EC = FName.getError()) return EC; - auto Checksum = readNumber<uint64_t>(); - if (std::error_code EC = Checksum.getError()) - return EC; - SampleContext FContext(*FName); - // No need to load metadata for profiles that are not loaded in the current - // module. - if (Profiles.count(FContext)) - Profiles[FContext].setFunctionHash(*Checksum); + bool ProfileInMap = Profiles.count(FContext); + + if (ProfileIsProbeBased) { + auto Checksum = readNumber<uint64_t>(); + if (std::error_code EC = Checksum.getError()) + return EC; + if (ProfileInMap) + Profiles[FContext].setFunctionHash(*Checksum); + } + + if (ProfileHasAttribute) { + auto Attributes = readNumber<uint32_t>(); + if (std::error_code EC = Attributes.getError()) + return EC; + if (ProfileInMap) + Profiles[FContext].getContext().setAllAttributes(*Attributes); + } } assert(Data == End && "More data is read than expected"); diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp index 7a00c3fec7c7..b9643480a8e4 100644 --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -170,12 +170,15 @@ std::error_code SampleProfileWriterExtBinaryBase::writeFuncOffsetTable() { std::error_code SampleProfileWriterExtBinaryBase::writeFuncMetadata( const StringMap<FunctionSamples> &Profiles) { - if (!FunctionSamples::ProfileIsProbeBased) + if (!FunctionSamples::ProfileIsProbeBased && !FunctionSamples::ProfileIsCS) return sampleprof_error::success; auto &OS = *OutputStream; for (const auto &Entry : Profiles) { writeNameIdx(Entry.first()); - encodeULEB128(Entry.second.getFunctionHash(), OS); + if (FunctionSamples::ProfileIsProbeBased) + encodeULEB128(Entry.second.getFunctionHash(), OS); + if (FunctionSamples::ProfileIsCS) + encodeULEB128(Entry.second.getContext().getAllAttributes(), OS); } return sampleprof_error::success; } @@ -239,6 +242,8 @@ std::error_code SampleProfileWriterExtBinaryBase::writeOneSection( addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsProbeBased); if (Type == SecProfSummary && FunctionSamples::ProfileIsCS) addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext); + if (Type == SecFuncMetadata && FunctionSamples::ProfileIsCS) + addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagHasAttribute); uint64_t SectionStart = markSectionStart(Type, LayoutIdx); switch (Type) { @@ -417,6 +422,10 @@
std::error_code SampleProfileWriterText::writeSample(const FunctionSamples &S) { OS.indent(Indent + 1); OS << "!CFGChecksum: " << S.getFunctionHash() << "\n"; } + if (FunctionSamples::ProfileIsCS) { + OS.indent(Indent + 1); + OS << "!Attributes: " << S.getContext().getAllAttributes() << "\n"; + } } return sampleprof_error::success; diff --git a/llvm/test/tools/llvm-profdata/Inputs/cs-sample.proftext b/llvm/test/tools/llvm-profdata/Inputs/cs-sample.proftext index eead4d4d62f0..e960dea02e6b 100644 --- a/llvm/test/tools/llvm-profdata/Inputs/cs-sample.proftext +++ b/llvm/test/tools/llvm-profdata/Inputs/cs-sample.proftext @@ -4,6 +4,7 @@ 3: 287884 4: 287864 _Z3fibi:315608 15: 23 + !Attributes: 0 [main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20 0: 15 1: 15 @@ -12,25 +13,32 @@ 10: 23324 11: 23327 _Z3fibi:25228 15: 11 + !Attributes: 1 [main]:154:0 2: 12 3: 18 _Z5funcAi:11 3.1: 18 _Z5funcBi:19 + !Attributes: 0 [external:12 @ main]:154:12 2: 12 3: 10 _Z5funcAi:7 3.1: 10 _Z5funcBi:11 + !Attributes: 0 [main:3.1 @ _Z5funcBi]:120:19 0: 19 1: 19 _Z8funcLeafi:20 3: 12 + !Attributes: 1 [externalA:17 @ _Z5funcBi]:120:3 0: 3 1: 3 + !Attributes: 0 [external:10 @ _Z5funcBi]:120:10 0: 10 1: 10 + !Attributes: 0 [main:3 @ _Z5funcAi]:99:11 0: 10 1: 10 _Z8funcLeafi:11 3: 24 + !Attributes: 0 diff --git a/llvm/test/tools/llvm-profdata/Inputs/pseudo-probe-profile.proftext b/llvm/test/tools/llvm-profdata/Inputs/pseudo-probe-profile.proftext index f4ae6d919747..82f57d6065f8 100644 --- a/llvm/test/tools/llvm-profdata/Inputs/pseudo-probe-profile.proftext +++ b/llvm/test/tools/llvm-profdata/Inputs/pseudo-probe-profile.proftext @@ -6,3 +6,4 @@ foo:3200:13 5: 7 _Z3foov:5 _Z3barv:2 6: 6 _Z3barv:4 _Z3foov:2 !CFGChecksum: 563022570642068 + !Attributes: 0 diff --git a/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test b/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test index cb414c2e6c06..5fc87475f505 100644 --- a/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test +++ b/llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test @@ -9,7 +9,7 @@ ; CHECK-NEXT: 6: 15 ; CHECK-NEXT: 8: 14 bar:14 ; CHECK-NEXT: !CFGChecksum: 138950591924 -; CHECK-NEXT:[main:2 @ foo:8 @ bar]:28:14 +; CHECK:[main:2 @ foo:8 @ bar]:28:14 ; CHECK-NEXT: 1: 14 ; CHECK-NEXT: 2: 18446744073709551615 ; CHECK-NEXT: 3: 18446744073709551615 diff --git a/llvm/test/tools/llvm-profgen/merge-cold-profile.test b/llvm/test/tools/llvm-profgen/merge-cold-profile.test index e0c65ac44e2b..43dc73e739ad 100644 --- a/llvm/test/tools/llvm-profgen/merge-cold-profile.test +++ b/llvm/test/tools/llvm-profgen/merge-cold-profile.test @@ -14,6 +14,7 @@ ; CHECK-NEXT: 7: 2 fb:2 ; CHECK-NEXT: 8: 1 fa:1 ; CHECK-NEXT: !CFGChecksum: 120515930909 +; CHECK-NEXT: !Attributes: 0 ; CHECK-NEXT:[main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4 ; CHECK-NEXT: 1: 4 ; CHECK-NEXT: 2: 3 @@ -29,6 +30,7 @@ ; CHECK-KEEP-COLD-NEXT: 5: 4 fb:4 ; CHECK-KEEP-COLD-NEXT: 6: 3 fa:3 ; CHECK-KEEP-COLD-NEXT: !CFGChecksum: 72617220756 +; CHECK-KEEP-COLD-NEXT: !Attributes: 0 ; CHECK-KEEP-COLD-NEXT:[fa]:14:4 ; CHECK-KEEP-COLD-NEXT: 1: 4 ; CHECK-KEEP-COLD-NEXT: 3: 4 diff --git a/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test b/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test index 64a8b052ab93..c4edb978bfca 100644 --- a/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test +++ b/llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test @@ -8,7 +8,7 @@ ; CHECK-NEXT: 6: 15 ; CHECK-NEXT: 8: 15 bar:15 ; CHECK-NEXT: !CFGChecksum: 138950591924 -; CHECK-NEXT:[main:2 @ foo:8 @ bar]:30:15 +; 
CHECK:[main:2 @ foo:8 @ bar]:30:15 ; CHECK-NEXT: 1: 15 ; CHECK-NEXT: 2: 18446744073709551615 ; CHECK-NEXT: 3: 18446744073709551615 diff --git a/llvm/tools/llvm-profgen/PerfReader.cpp b/llvm/tools/llvm-profgen/PerfReader.cpp index 1f842008db42..0d60fa3332b4 100644 --- a/llvm/tools/llvm-profgen/PerfReader.cpp +++ b/llvm/tools/llvm-profgen/PerfReader.cpp @@ -92,7 +92,8 @@ void VirtualUnwinder::unwindBranchWithinFrame(UnwindState &State) { std::shared_ptr<StringBasedCtxKey> FrameStack::getContextKey() { std::shared_ptr<StringBasedCtxKey> KeyStr = std::make_shared<StringBasedCtxKey>(); - KeyStr->Context = Binary->getExpandedContextStr(Stack); + KeyStr->Context = + Binary->getExpandedContextStr(Stack, KeyStr->WasLeafInlined); if (KeyStr->Context.empty()) return nullptr; KeyStr->genHashCode(); diff --git a/llvm/tools/llvm-profgen/PerfReader.h b/llvm/tools/llvm-profgen/PerfReader.h index b802c212eb46..a1d319226864 100644 --- a/llvm/tools/llvm-profgen/PerfReader.h +++ b/llvm/tools/llvm-profgen/PerfReader.h @@ -311,7 +311,8 @@ struct ContextKey { // String based context id struct StringBasedCtxKey : public ContextKey { std::string Context; - StringBasedCtxKey() : ContextKey(CK_StringBased){}; + bool WasLeafInlined; + StringBasedCtxKey() : ContextKey(CK_StringBased), WasLeafInlined(false){}; static bool classof(const ContextKey *K) { return K->getKind() == CK_StringBased; } diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp index 77416d2ff989..81b0c912884f 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -188,10 +188,13 @@ void ProfileGenerator::findDisjointRanges(RangeSample &DisjointRanges, } FunctionSamples & -CSProfileGenerator::getFunctionProfileForContext(StringRef ContextStr) { +CSProfileGenerator::getFunctionProfileForContext(StringRef ContextStr, + bool WasLeafInlined) { auto Ret = ProfileMap.try_emplace(ContextStr, FunctionSamples()); if (Ret.second) { SampleContext FContext(Ret.first->first(), RawContext); + if (WasLeafInlined) + FContext.setAttribute(ContextWasInlined); FunctionSamples &FProfile = Ret.first->second; FProfile.setContext(FContext); } @@ -208,7 +211,7 @@ void CSProfileGenerator::generateProfile() { StringRef ContextId(CtxKey->Context); // Get or create function profile for the range FunctionSamples &FunctionProfile = - getFunctionProfileForContext(ContextId); + getFunctionProfileForContext(ContextId, CtxKey->WasLeafInlined); // Fill in function body samples populateFunctionBodySamples(FunctionProfile, CI.second.RangeCounter, @@ -428,6 +431,7 @@ void CSProfileGenerator::write(std::unique_ptr<SampleProfileWriter> Writer, assert(Ret.second && "Must be a unique context"); SampleContext FContext(Ret.first->first(), RawContext); FunctionSamples &FProfile = Ret.first->second; + FContext.setAllAttributes(FProfile.getContext().getAllAttributes()); FProfile.setName(FContext.getNameWithContext(true)); FProfile.setContext(FContext); } @@ -587,7 +591,7 @@ void PseudoProbeCSProfileGenerator::populateBoundarySamplesWithProbes( FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe( SmallVectorImpl<std::string> &ContextStrStack, - const PseudoProbeFuncDesc *LeafFuncDesc) { + const PseudoProbeFuncDesc *LeafFuncDesc, bool WasLeafInlined) { assert(ContextStrStack.size() && "Profile context must have the leaf frame"); // Compress the context string except for the leaf frame std::string LeafFrame = ContextStrStack.back(); @@ -608,7 +612,7 @@ FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe( OContextStr <<
StringRef(LeafFrame).split(":").first.str(); FunctionSamples &FunctionProfile = - getFunctionProfileForContext(OContextStr.str()); + getFunctionProfileForContext(OContextStr.str(), WasLeafInlined); FunctionProfile.setFunctionHash(LeafFuncDesc->FuncHash); return FunctionProfile; } @@ -619,13 +623,11 @@ FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe( // Explicitly copy the context for appending the leaf context SmallVector<std::string, 16> ContextStrStackCopy(ContextStrStack.begin(), ContextStrStack.end()); - Binary->getInlineContextForProbe(LeafProbe, ContextStrStackCopy); - // Note that the context from probe doesn't include leaf frame, - // hence we need to retrieve and append the leaf frame. + Binary->getInlineContextForProbe(LeafProbe, ContextStrStackCopy, true); const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->GUID); - ContextStrStackCopy.emplace_back(FuncDesc->FuncName + ":" + - Twine(LeafProbe->Index).str()); - return getFunctionProfileForLeafProbe(ContextStrStackCopy, FuncDesc); + bool WasLeafInlined = LeafProbe->InlineTree->hasInlineSite(); + return getFunctionProfileForLeafProbe(ContextStrStackCopy, FuncDesc, + WasLeafInlined); } } // end namespace sampleprof diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h index 4ea459e7dabb..2205f781e682 100644 --- a/llvm/tools/llvm-profgen/ProfileGenerator.h +++ b/llvm/tools/llvm-profgen/ProfileGenerator.h @@ -174,7 +174,8 @@ public: protected: // Lookup or create FunctionSamples for the context - FunctionSamples &getFunctionProfileForContext(StringRef ContextId); + FunctionSamples &getFunctionProfileForContext(StringRef ContextId, + bool WasLeafInlined = false); // Merge cold context profile whose total sample is below threshold // into base profile.
void mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap); @@ -229,7 +230,8 @@ private: // Helper function to get FunctionSamples for the leaf inlined context FunctionSamples & getFunctionProfileForLeafProbe(SmallVectorImpl<std::string> &ContextStrStack, - const PseudoProbeFuncDesc *LeafFuncDesc); + const PseudoProbeFuncDesc *LeafFuncDesc, + bool WasLeafInlined); // Helper function to get FunctionSamples for the leaf probe FunctionSamples & getFunctionProfileForLeafProbe(SmallVectorImpl<std::string> &ContextStrStack, diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.cpp b/llvm/tools/llvm-profgen/ProfiledBinary.cpp index 2d6cbfe474fd..9063f06f5579 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.cpp +++ b/llvm/tools/llvm-profgen/ProfiledBinary.cpp @@ -131,8 +131,9 @@ bool ProfiledBinary::inlineContextEqual(uint64_t Address1, Context2.begin(), Context2.begin() + Context2.size() - 1); } -std::string ProfiledBinary::getExpandedContextStr( - const SmallVectorImpl<uint64_t> &Stack) const { +std::string +ProfiledBinary::getExpandedContextStr(const SmallVectorImpl<uint64_t> &Stack, + bool &WasLeafInlined) const { std::string ContextStr; SmallVector<std::string, 16> ContextVec; // Process from frame root to leaf @@ -143,6 +144,9 @@ std::string ProfiledBinary::getExpandedContextStr( // processing if (ExpandedContext.empty()) return std::string(); + // Set WasLeafInlined if the leaf address expands to more than one frame, + // i.e. the leaf frame of the context was inlined. + WasLeafInlined = (ExpandedContext.size() > 1); for (const auto &Loc : ExpandedContext) { ContextVec.push_back(getCallSite(Loc)); } diff --git a/llvm/tools/llvm-profgen/ProfiledBinary.h b/llvm/tools/llvm-profgen/ProfiledBinary.h index 7ceca1c5995c..b56574e0bf6f 100644 --- a/llvm/tools/llvm-profgen/ProfiledBinary.h +++ b/llvm/tools/llvm-profgen/ProfiledBinary.h @@ -239,8 +239,8 @@ public: // Get the context string of the current stack with inline context filled in. // It will search the disassembling info stored in Offset2LocStackMap. This is // used as the key of function sample map - std::string - getExpandedContextStr(const SmallVectorImpl<uint64_t> &Stack) const; + std::string getExpandedContextStr(const SmallVectorImpl<uint64_t> &Stack, + bool &WasLeafInlined) const; const PseudoProbe *getCallProbeForAddr(uint64_t Address) const { return ProbeDecoder.getCallProbeForAddr(Address); -- GitLab From fc1812a0ad757838b66aab57e1df720ec205a16a Mon Sep 17 00:00:00 2001 From: Hongtao Yu Date: Wed, 17 Mar 2021 11:17:17 -0700 Subject: [PATCH 0108/1000] [UniqueLinkageName] Use consistent checks when mangling symbol linkage name and debug linkage name. C functions may be declared and defined with different prototypes, like below. This patch unifies the checks for mangling names in symbol linkage name emission and debug linkage name emission so that the two names are consistent. static int go(int); static int go(a) int a; { return a; } Test Plan: Differential Revision: https://reviews.llvm.org/D98799 --- clang/lib/AST/ItaniumMangle.cpp | 2 +- clang/lib/CodeGen/CGDebugInfo.cpp | 2 +- .../unique-internal-linkage-names-dwarf.c | 27 +++++++++++++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index ba96fda6cd57..3e6e29207f08 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -640,7 +640,7 @@ bool ItaniumMangleContextImpl::isUniqueInternalLinkageDecl( // For C functions without prototypes, return false as their // names should not be mangled.
- if (!FD->getType()->getAs<FunctionProtoType>()) + if (!FD->hasPrototype()) return false; if (isInternalLinkageDecl(ND)) diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 468c2b78b488..c80249a9c9fc 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -3522,7 +3522,7 @@ void CGDebugInfo::collectFunctionDeclProps(GlobalDecl GD, llvm::DIFile *Unit, llvm::DIScope *&FDContext, llvm::DINodeArray &TParamsArray, llvm::DINode::DIFlags &Flags) { - const auto *FD = cast<FunctionDecl>(GD.getDecl()); + const auto *FD = cast<FunctionDecl>(GD.getCanonicalDecl().getDecl()); Name = getFunctionName(FD); // Use mangled name as linkage name for C/C++ functions. if (FD->hasPrototype()) { diff --git a/clang/test/CodeGen/unique-internal-linkage-names-dwarf.c b/clang/test/CodeGen/unique-internal-linkage-names-dwarf.c index a3583426de79..e5d507e154ae 100644 --- a/clang/test/CodeGen/unique-internal-linkage-names-dwarf.c +++ b/clang/test/CodeGen/unique-internal-linkage-names-dwarf.c @@ -8,21 +8,48 @@ // RUN: %clang_cc1 -triple x86_64-unknown-linux -debug-info-kind=limited -dwarf-version=5 -funique-internal-linkage-names -emit-llvm -o - %s | FileCheck %s --check-prefix=UNIQUE static int glob; +// foo should be given a uniquified name under -funique-internal-linkage-names. static int foo(void) { return glob; } +// bar should not be given a uniquified name under -funique-internal-linkage-names, +// since it doesn't come with a valid prototype. +static int bar(a) int a; +{ + return glob + a; +} + +// go should be given a uniquified name under -funique-internal-linkage-names, even +// though its definition doesn't come with a valid prototype, because the declaration +// here has one. +static int go(int); + void baz() { foo(); + bar(1); + go(2); } +static int go(a) int a; +{ + return glob + a; +} + + // PLAIN: @glob = internal global i32 // PLAIN: define internal i32 @foo() +// PLAIN: define internal i32 @bar(i32 %a) // PLAIN: distinct !DIGlobalVariable(name: "glob"{{.*}}) // PLAIN: distinct !DISubprogram(name: "foo"{{.*}}) +// PLAIN: distinct !DISubprogram(name: "bar"{{.*}}) +// PLAIN: distinct !DISubprogram(name: "go"{{.*}}) // PLAIN-NOT: linkageName: // // UNIQUE: @glob = internal global i32 // UNIQUE: define internal i32 @_ZL3foov.[[MODHASH:__uniq.[0-9]+]]() +// UNIQUE: define internal i32 @bar(i32 %a) +// UNIQUE: define internal i32 @_ZL2goi.[[MODHASH]](i32 %a) // UNIQUE: distinct !DIGlobalVariable(name: "glob"{{.*}}) // UNIQUE: distinct !DISubprogram(name: "foo", linkageName: "_ZL3foov.[[MODHASH]]"{{.*}}) +// UNIQUE: distinct !DISubprogram(name: "go", linkageName: "_ZL2goi.[[MODHASH]]"{{.*}}) -- GitLab From 16370e02a715717dd585537f02eb3e3a3221637e Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 19 Mar 2021 12:00:06 +0700 Subject: [PATCH 0109/1000] [IndVars] Provide eliminateIVComparison with context We can prove more predicates when we have a context while eliminating an ICmp. As a first (and very obvious) approximation, we can use the ICmp instruction itself, though in the future we are going to use the common dominator of all its users. Some refactoring is needed before that. Observed ~0.5% negative compile time impact.
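For illustration, a minimal sketch (not part of the patch) of the two ScalarEvolution queries involved; Pred, S, X and ICmp are as in eliminateIVComparison below:

  // Context-free query: folds only if Pred is known to hold for all
  // possible values of S and X.
  Optional<bool> Everywhere = SE->evaluatePredicate(Pred, S, X);
  // Context-sensitive query: may additionally use facts that hold at the
  // given instruction, e.g. conditions of dominating guards, so it can
  // fold comparisons that are only true at that particular program point.
  Optional<bool> AtPoint = SE->evaluatePredicateAt(Pred, S, X, ICmp);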
Differential Revision: https://reviews.llvm.org/D98697 Reviewed By: lebedev.ri --- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 7 ++++--- llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll | 4 ++-- .../IndVarSimplify/checks_against_min_value.ll | 6 ++---- .../IndVarSimplify/eliminate-comparison.ll | 12 ++++-------- llvm/test/Transforms/LoopLoadElim/pr-49141.ll | 8 +++----- 5 files changed, 15 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index d0c43bb26105..120556fc912d 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -261,9 +261,10 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) { const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop); const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop); - // If the condition is always true or always false, replace it with - // a constant value. - if (auto Ev = SE->evaluatePredicate(Pred, S, X)) { + // If the condition is always true or always false in the given context, + // replace it with a constant value. + // TODO: We can sharpen the context to common dominator of all ICmp's users. + if (auto Ev = SE->evaluatePredicateAt(Pred, S, X, ICmp)) { ICmp->replaceAllUsesWith(ConstantInt::getBool(ICmp->getContext(), *Ev)); DeadInsts.emplace_back(ICmp); LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n'); diff --git a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll index 6d7bbced417d..cd5615a1bc67 100644 --- a/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll +++ b/llvm/test/Transforms/IndVarSimplify/X86/pr35406.ll @@ -23,7 +23,7 @@ define i32 @testDiv(i8* %p, i64* %p1) { ; CHECK-NEXT: [[I4:%.*]] = load atomic i64, i64* [[P1:%.*]] unordered, align 8 ; CHECK-NEXT: [[I6:%.*]] = sub i64 [[I4]], [[INDVARS_IV_NEXT2]] ; CHECK-NEXT: store atomic i64 [[I6]], i64* [[P1]] unordered, align 8 -; CHECK-NEXT: br i1 false, label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] +; CHECK-NEXT: br i1 true, label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] ; CHECK: loop2.exit.loopexit: ; CHECK-NEXT: br label [[LOOP2_EXIT]] ; CHECK: loop2.exit: @@ -94,7 +94,7 @@ define i32 @testRem(i8* %p, i64* %p1) { ; CHECK-NEXT: [[I4:%.*]] = load atomic i64, i64* [[P1:%.*]] unordered, align 8 ; CHECK-NEXT: [[I6:%.*]] = sub i64 [[I4]], [[INDVARS_IV_NEXT]] ; CHECK-NEXT: store atomic i64 [[I6]], i64* [[P1]] unordered, align 8 -; CHECK-NEXT: br i1 false, label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] +; CHECK-NEXT: br i1 true, label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[LOOP2]] ; CHECK: loop2.exit.loopexit: ; CHECK-NEXT: br label [[LOOP2_EXIT]] ; CHECK: loop2.exit: diff --git a/llvm/test/Transforms/IndVarSimplify/checks_against_min_value.ll b/llvm/test/Transforms/IndVarSimplify/checks_against_min_value.ll index 4575a4547fac..2b25daf9573b 100644 --- a/llvm/test/Transforms/IndVarSimplify/checks_against_min_value.ll +++ b/llvm/test/Transforms/IndVarSimplify/checks_against_min_value.ll @@ -15,8 +15,7 @@ define void @test_signed(i32 %start) { ; CHECK-NEXT: [[CHECK:%.*]] = icmp slt i32 [[IV_NEXT]], [[IV]] ; CHECK-NEXT: br i1 [[CHECK]], label [[GUARDED]], label [[FAIL:%.*]] ; CHECK: guarded: -; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[IV]], -2147483648 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK-NEXT: br i1 true, label [[LOOP]], label 
[[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: @@ -58,8 +57,7 @@ define void @test_unsigned(i32 %start) { ; CHECK-NEXT: [[CHECK:%.*]] = icmp ult i32 [[IV_NEXT]], [[IV]] ; CHECK-NEXT: br i1 [[CHECK]], label [[GUARDED]], label [[FAIL:%.*]] ; CHECK: guarded: -; CHECK-NEXT: [[LOOP_COND:%.*]] = icmp ne i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK-NEXT: br i1 true, label [[LOOP]], label [[EXIT_LOOPEXIT:%.*]] ; CHECK: exit.loopexit: ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll index 3eb7b12dce2a..6e0d7376c8c4 100644 --- a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll +++ b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll @@ -951,11 +951,9 @@ define i32 @func_25(i32 %start) { ; CHECK-NEXT: [[C1:%.*]] = icmp ne i32 [[IV]], 0 ; CHECK-NEXT: br i1 [[C1]], label [[CHECKED_1:%.*]], label [[FAIL:%.*]] ; CHECK: checked.1: -; CHECK-NEXT: [[C2:%.*]] = icmp ne i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[C2]], label [[CHECKED_2:%.*]], label [[FAIL]] +; CHECK-NEXT: br i1 true, label [[CHECKED_2:%.*]], label [[FAIL]] ; CHECK: checked.2: -; CHECK-NEXT: [[C3:%.*]] = icmp ne i32 [[IV]], 0 -; CHECK-NEXT: br i1 [[C3]], label [[BACKEDGE]], label [[FAIL]] +; CHECK-NEXT: br i1 true, label [[BACKEDGE]], label [[FAIL]] ; CHECK: backedge: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() @@ -1003,11 +1001,9 @@ define i32 @func_26(i32 %start) { ; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[IV]], 0 ; CHECK-NEXT: br i1 [[C1]], label [[CHECKED_1:%.*]], label [[FAIL:%.*]] ; CHECK: checked.1: -; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[IV]], 1 -; CHECK-NEXT: br i1 [[C2]], label [[CHECKED_2:%.*]], label [[FAIL]] +; CHECK-NEXT: br i1 true, label [[CHECKED_2:%.*]], label [[FAIL]] ; CHECK: checked.2: -; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[IV]], 2 -; CHECK-NEXT: br i1 [[C3]], label [[BACKEDGE]], label [[FAIL]] +; CHECK-NEXT: br i1 true, label [[BACKEDGE]], label [[FAIL]] ; CHECK: backedge: ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 ; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() diff --git a/llvm/test/Transforms/LoopLoadElim/pr-49141.ll b/llvm/test/Transforms/LoopLoadElim/pr-49141.ll index df7a97581d6a..8c7b5f163419 100644 --- a/llvm/test/Transforms/LoopLoadElim/pr-49141.ll +++ b/llvm/test/Transforms/LoopLoadElim/pr-49141.ll @@ -8,12 +8,10 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: br i1 true, label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[A_01:%.*]] = phi i16 [ undef, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INC]] = add nsw i16 [[A_01]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp sle i16 [[INC]], 2 -; CHECK-NEXT: [[OR_COND:%.*]] = and i1 false, [[CMP]] -; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_BODY]], label [[FOR_END:%.*]] +; CHECK-NEXT: br i1 false, label [[FOR_COND:%.*]], label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; -- GitLab From 270a336ff46204acf887def32c92ad695f767471 Mon Sep 17 00:00:00 2001 From: Vladislav Vinogradov Date: Fri, 19 Mar 2021 05:32:19 +0000 Subject: [PATCH 0110/1000] [mlir] Fix Python bindings tests failure in Debug mode after D98474 Add extra `type.isa<FloatType>()` check to `FloatAttr::get(Type, double)` method.
Otherwise it tries to call `type.cast<FloatType>()`, which fails with an assertion in Debug mode. The `!type.isa<FloatType>()` case just redirects the call to `FloatAttr::get(Type, APFloat)`, which will perform the actual check and emit an appropriate error. Reviewed By: mehdi_amini Differential Revision: https://reviews.llvm.org/D98764 --- mlir/include/mlir/IR/BuiltinAttributes.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/include/mlir/IR/BuiltinAttributes.td b/mlir/include/mlir/IR/BuiltinAttributes.td index 433c33521a7a..45214535b1f8 100644 --- a/mlir/include/mlir/IR/BuiltinAttributes.td +++ b/mlir/include/mlir/IR/BuiltinAttributes.td @@ -406,7 +406,7 @@ def Builtin_FloatAttr : Builtin_Attr<"Float"> { return $_get(type.getContext(), type, value); }]>, AttrBuilderWithInferredContext<(ins "Type":$type, "double":$value), [{ - if (type.isF64()) + if (type.isF64() || !type.isa<FloatType>()) return $_get(type.getContext(), type, APFloat(value)); // This handles, e.g., F16 because there is no APFloat constructor for it. -- GitLab From f178c13fa89960c7247a6367269919acf87fd1b3 Mon Sep 17 00:00:00 2001 From: Andrew Young Date: Thu, 18 Mar 2021 20:06:02 -0700 Subject: [PATCH 0111/1000] [mlir] Support use-def cycles in graph regions during regionDCE When deleting operations in DCE, the algorithm uses a post-order walk of the IR to ensure that value uses were erased before value defs. Graph regions do not have the same structural invariants as SSA CFG, and this post-order walk could delete value defs before uses. This problem is guaranteed to occur when there is a cycle in the use-def graph. This change stops DCE from depending on the order in which operations and blocks are visited. Instead, we rely on explicitly dropping all uses of a value before deleting it. Reviewed By: mehdi_amini, rriddle Differential Revision: https://reviews.llvm.org/D98919 --- mlir/lib/Transforms/Utils/RegionUtils.cpp | 15 ++++++--------- mlir/test/Transforms/canonicalize-dce.mlir | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp index 7dd064ef0341..21d0ff53fdc8 100644 --- a/mlir/lib/Transforms/Utils/RegionUtils.cpp +++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp @@ -312,21 +312,18 @@ static LogicalResult deleteDeadness(MutableArrayRef<Region> regions, if (region.empty()) continue; - // We do the deletion in an order that deletes all uses before deleting - // defs. - // MLIR's SSA structural invariants guarantee that except for block - // arguments, the use-def graph is acyclic, so this is possible with a - // single walk of ops and then a final pass to clean up block arguments. - // - // To do this, we visit ops in an order that visits domtree children - // before domtree parents. A CFG post-order (with reverse iteration with a - // block) satisfies that without needing an explicit domtree calculation. + // Delete every operation that is not live. Graph regions may have cycles + // in the use-def graph, so we must explicitly dropAllUses() from each + // operation as we erase it. Visiting the operations in post-order + // guarantees that in SSA CFG regions value uses are removed before defs, + // which makes dropAllUses() a no-op.
for (Block *block : llvm::post_order(&region.front())) { eraseTerminatorSuccessorOperands(block->getTerminator(), liveMap); for (Operation &childOp : llvm::make_early_inc_range(llvm::reverse(block->getOperations()))) { if (!liveMap.wasProvenLive(&childOp)) { erasedAnything = true; + childOp.dropAllUses(); childOp.erase(); } else { erasedAnything |= diff --git a/mlir/test/Transforms/canonicalize-dce.mlir b/mlir/test/Transforms/canonicalize-dce.mlir index 4a351dca426e..e96bd65d389a 100644 --- a/mlir/test/Transforms/canonicalize-dce.mlir +++ b/mlir/test/Transforms/canonicalize-dce.mlir @@ -156,3 +156,20 @@ func @f( "foo.print"(%t4) : (tensor<4xf32>) -> () return } + +// ----- + +// Test case: Test values with use-def cycles are deleted properly. + +// CHECK: func @f() +// CHECK-NEXT: test.graph_region +// CHECK-NEXT: "test.terminator"() : () -> () + +func @f() { + test.graph_region { + %0 = "math.exp"(%1) : (f32) -> f32 + %1 = "math.exp"(%0) : (f32) -> f32 + "test.terminator"() : ()->() + } + return +} -- GitLab From c241659d1573b0c89fa4d6591d7bd9d3fc84e37a Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 18 Mar 2021 23:22:58 -0700 Subject: [PATCH 0112/1000] [X86] Fix -Wunused-function in -DLLVM_ENABLE_ASSERTIONS=off builds --- llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp index 134df5d9569c..e267ba44e28b 100644 --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -43,12 +43,14 @@ using namespace PatternMatch; #define DEBUG_TYPE "lower-amx-intrinsics" +#ifndef NDEBUG static bool isV256I32Ty(Type *Ty) { if (auto *FVT = dyn_cast<FixedVectorType>(Ty)) return FVT->getNumElements() == 256 && FVT->getElementType()->isIntegerTy(32); return false; } +#endif namespace { class X86LowerAMXIntrinsics { -- GitLab From ce97d8e6c7409501e9b42de3db34ae0486115e25 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 18 Mar 2021 23:42:31 -0700 Subject: [PATCH 0113/1000] Revert "[WoA][MSVC] Use default linker setting in MSVC-compatible driver" This reverts commit ace56d41aca8cac7cead9c2c97278aa50fc945b1 which broke builders that set CLANG_DEFAULT_LINKER. --- clang/lib/Driver/ToolChains/MSVC.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 38ad7125b4af..96de02378ca2 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -11,7 +11,6 @@ #include "Darwin.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/Version.h" -#include "clang/Config/config.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" @@ -578,10 +577,7 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA, // translate 'lld' into 'lld-link', and in the case of the regular msvc // linker, we need to use a special search algorithm.
llvm::SmallString<128> linkPath; - StringRef Linker = Args.getLastArgValue(options::OPT_fuse_ld_EQ, - CLANG_DEFAULT_LINKER); - if (Linker.empty()) - Linker = "link"; + StringRef Linker = Args.getLastArgValue(options::OPT_fuse_ld_EQ, "link"); if (Linker.equals_lower("lld")) Linker = "lld-link"; -- GitLab From 8bb952b57fac8b9a37dc132f94df7adc697b10bb Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 19 Mar 2021 13:07:57 +0700 Subject: [PATCH 0114/1000] [NFC] Factor out utility function for finding common dom of user set --- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 34 ++++++++++++-------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 120556fc912d..538141132292 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -1477,6 +1477,24 @@ bool WidenIV::widenLoopCompare(WidenIV::NarrowIVDefUse DU) { return true; } +/// Find a point in code which dominates all given instructions. We can safely +/// assume that, whatever fact we can prove at the found point, this fact is +/// also true for each of the given instructions. +static Instruction *findCommonDominator(ArrayRef<Instruction *> Instructions, + DominatorTree &DT) { + Instruction *CommonDom = nullptr; + for (auto *Insn : Instructions) + if (!CommonDom || DT.dominates(Insn, CommonDom)) + CommonDom = Insn; + else if (!DT.dominates(CommonDom, Insn)) + // If there is no dominance relation, use common dominator. + CommonDom = + DT.findNearestCommonDominator(CommonDom->getParent(), + Insn->getParent())->getTerminator(); + assert(CommonDom && "Common dominator not found?"); + return CommonDom; +} + // The widenIVUse avoids generating trunc by evaluating the use as AddRec, this // will not work when: // 1) SCEV traces back to an instruction inside the loop that SCEV can not @@ -1572,17 +1590,7 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) { // We'll prove some facts that should be true in the context of ext users. If // there are no users, we are done now. If there are some, pick their common // dominator as context. - Instruction *Context = nullptr; - for (auto *Ext : ExtUsers) { - if (!Context || DT->dominates(Ext, Context)) - Context = Ext; - else if (!DT->dominates(Context, Ext)) - // For users that don't have dominance relation, use common dominator. - Context = - DT->findNearestCommonDominator(Context->getParent(), Ext->getParent()) - ->getTerminator(); - } - assert(Context && "Context not found?"); + const Instruction *CtxI = findCommonDominator(ExtUsers, *DT); if (!CanSignExtend && !CanZeroExtend) { // Because InstCombine turns 'sub nuw' to 'add' losing the no-wrap flag, we @@ -1598,8 +1606,8 @@ bool WidenIV::widenWithVariantUse(WidenIV::NarrowIVDefUse DU) { return false; if (!SE->isKnownNegative(RHS)) return false; - bool ProvedSubNUW = SE->isKnownPredicateAt( - ICmpInst::ICMP_UGE, LHS, SE->getNegativeSCEV(RHS), Context); + bool ProvedSubNUW = SE->isKnownPredicateAt(ICmpInst::ICMP_UGE, LHS, + SE->getNegativeSCEV(RHS), CtxI); if (!ProvedSubNUW) return false; // In fact, our 'add' is 'sub nuw'.
We will need to widen the 2nd operand as -- GitLab From 8eefa07fcfe7b5d4d5827c071e494ecb78c7815c Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 19 Mar 2021 14:03:31 +0700 Subject: [PATCH 0115/1000] [NFC] Move function up in code --- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 36 ++++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 538141132292..f0e446684801 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -99,6 +99,24 @@ namespace { }; } +/// Find a point in code which dominates all given instructions. We can safely +/// assume that, whatever fact we can prove at the found point, this fact is +/// also true for each of the given instructions. +static Instruction *findCommonDominator(ArrayRef<Instruction *> Instructions, + DominatorTree &DT) { + Instruction *CommonDom = nullptr; + for (auto *Insn : Instructions) + if (!CommonDom || DT.dominates(Insn, CommonDom)) + CommonDom = Insn; + else if (!DT.dominates(CommonDom, Insn)) + // If there is no dominance relation, use common dominator. + CommonDom = + DT.findNearestCommonDominator(CommonDom->getParent(), + Insn->getParent())->getTerminator(); + assert(CommonDom && "Common dominator not found?"); + return CommonDom; +} + /// Fold an IV operand into its use. This removes increments of an /// aligned IV when used by an instruction that ignores the low bits. /// @@ -1477,24 +1495,6 @@ bool WidenIV::widenLoopCompare(WidenIV::NarrowIVDefUse DU) { return true; } -/// Find a point in code which dominates all given instructions. We can safely -/// assume that, whatever fact we can prove at the found point, this fact is -/// also true for each of the given instructions. -static Instruction *findCommonDominator(ArrayRef<Instruction *> Instructions, - DominatorTree &DT) { - Instruction *CommonDom = nullptr; - for (auto *Insn : Instructions) - if (!CommonDom || DT.dominates(Insn, CommonDom)) - CommonDom = Insn; - else if (!DT.dominates(CommonDom, Insn)) - // If there is no dominance relation, use common dominator.
- CommonDom = - DT.findNearestCommonDominator(CommonDom->getParent(), - Insn->getParent())->getTerminator(); - assert(CommonDom && "Common dominator not found?"); - return CommonDom; -} - // The widenIVUse avoids generating trunc by evaluating the use as AddRec, this // will not work when: // 1) SCEV traces back to an instruction inside the loop that SCEV can not -- GitLab From 4ee4f9bf4ae49df25b46351a0bfca3a36e7bf82d Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 19 Mar 2021 14:17:35 +0700 Subject: [PATCH 0116/1000] [Test] Precommit test --- .../IndVarSimplify/eliminate-comparison.ll | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll index 6e0d7376c8c4..c367176b4b59 100644 --- a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll +++ b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll @@ -1042,5 +1042,56 @@ exit: ret i32 %iv } +define i32 @func_27(i32 %start) { +; CHECK-LABEL: @func_27( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[IV]], 2 +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[IV]], 1 +; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[C1]], label [[CHECKED_1:%.*]], label [[FAIL:%.*]] +; CHECK: checked.1: +; CHECK-NEXT: br i1 [[C2]], label [[CHECKED_2:%.*]], label [[FAIL]] +; CHECK: checked.2: +; CHECK-NEXT: br i1 [[C3]], label [[BACKEDGE]], label [[FAIL]] +; CHECK: backedge: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 +; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: fail: +; CHECK-NEXT: unreachable +; CHECK: exit: +; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA1]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [%start, %entry], [%iv.next, %backedge] + %c1 = icmp slt i32 %iv, 2 + %c2 = icmp slt i32 %iv, 1 + %c3 = icmp slt i32 %iv, 0 + br i1 %c1, label %checked.1, label %fail + +checked.1: + br i1 %c2, label %checked.2, label %fail + +checked.2: + br i1 %c3, label %backedge, label %fail + +backedge: + %iv.next = add i32 %iv, 758394 + %loop.cond = call i1 @cond_func() + br i1 %loop.cond, label %loop, label %exit + +fail: + unreachable + +exit: + ret i32 %iv +} !0 = !{i32 0, i32 2147483647} -- GitLab From a825fb2c07337cc2c84783558e91416e07adcf42 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Fri, 19 Mar 2021 00:22:50 -0700 Subject: [PATCH 0117/1000] [mlir] Remove mlir-rocm-runner This change combines for ROCm what was done for CUDA in D97463, D98203, D98360, and D98396. I did not try to compile SerializeToHsaco.cpp or test mlir/test/Integration/GPU/ROCM because I don't have an AMD card. I fixed the things that had obvious bit-rot though. 
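As a usage sketch (assumed downstream code, not part of the patch): with the dedicated runner gone, a tool opts into the HSACO path through the pass registration hook added below, which is also pulled in by mlir::registerAllPasses() via InitAllPasses.h.

  #include "mlir/Dialect/GPU/Passes.h"

  // Hypothetical tool setup: makes the "gpu-to-hsaco" pass available, e.g. for
  // -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)'
  // as used by the updated integration tests.
  void registerMyToolPasses() { mlir::registerGpuSerializeToHsacoPass(); }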
Reviewed By: whchung Differential Revision: https://reviews.llvm.org/D98447 --- mlir/include/mlir/Dialect/GPU/Passes.h | 4 + mlir/include/mlir/InitAllPasses.h | 1 + mlir/lib/Dialect/GPU/CMakeLists.txt | 67 ++++ .../GPU/Transforms/SerializeToHsaco.cpp | 284 ++++++++++++++ mlir/lib/ExecutionEngine/CMakeLists.txt | 49 +++ .../ExecutionEngine/RocmRuntimeWrappers.cpp} | 30 +- mlir/test/CMakeLists.txt | 13 +- .../lower-rocdl-kernel-to-hsaco.mlir | 7 +- mlir/test/Integration/GPU/CUDA/lit.local.cfg | 2 +- .../GPU/ROCM}/gpu-to-hsaco.mlir | 8 +- .../GPU/ROCM}/lit.local.cfg | 0 .../GPU/ROCM}/two-modules.mlir | 12 +- .../GPU/ROCM}/vecadd.mlir | 9 +- .../GPU/ROCM}/vector-transferops.mlir | 9 +- .../TestConvertGPUKernelToHsaco.cpp | 60 +-- mlir/test/lit.cfg.py | 1 - mlir/test/lit.site.cfg.py.in | 1 - mlir/tools/CMakeLists.txt | 1 - mlir/tools/mlir-opt/mlir-opt.cpp | 4 +- mlir/tools/mlir-rocm-runner/CMakeLists.txt | 127 ------- .../mlir-rocm-runner/mlir-rocm-runner.cpp | 349 ------------------ 21 files changed, 494 insertions(+), 544 deletions(-) create mode 100644 mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp rename mlir/{tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp => lib/ExecutionEngine/RocmRuntimeWrappers.cpp} (91%) rename mlir/test/{mlir-rocm-runner => Integration/GPU/ROCM}/gpu-to-hsaco.mlir (82%) rename mlir/test/{mlir-rocm-runner => Integration/GPU/ROCM}/lit.local.cfg (100%) rename mlir/test/{mlir-rocm-runner => Integration/GPU/ROCM}/two-modules.mlir (74%) rename mlir/test/{mlir-rocm-runner => Integration/GPU/ROCM}/vecadd.mlir (87%) rename mlir/test/{mlir-rocm-runner => Integration/GPU/ROCM}/vector-transferops.mlir (90%) delete mode 100644 mlir/tools/mlir-rocm-runner/CMakeLists.txt delete mode 100644 mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h index 6a6a2c0678b6..bfb5626fca19 100644 --- a/mlir/include/mlir/Dialect/GPU/Passes.h +++ b/mlir/include/mlir/Dialect/GPU/Passes.h @@ -90,6 +90,10 @@ protected: /// annotation. void registerGpuSerializeToCubinPass(); +/// Register pass to serialize GPU kernel functions to a HSAco binary +/// annotation. +void registerGpuSerializeToHsacoPass(); + /// Generate the code for registering passes. 
#define GEN_PASS_REGISTRATION #include "mlir/Dialect/GPU/Passes.h.inc" diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h index 029df0735959..ab9629ac86c1 100644 --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -52,6 +52,7 @@ inline void registerAllPasses() { registerAsyncPasses(); registerGPUPasses(); registerGpuSerializeToCubinPass(); + registerGpuSerializeToHsacoPass(); registerLinalgPasses(); LLVM::registerLLVMPasses(); quant::registerQuantPasses(); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index d7fbfe0b5b61..ea70029c849e 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -6,6 +6,16 @@ if (MLIR_CUDA_CONVERSIONS_ENABLED) ) endif() +if (MLIR_ROCM_CONVERSIONS_ENABLED) + set(AMDGPU_LIBS + MCParser + AMDGPUAsmParser + AMDGPUCodeGen + AMDGPUDesc + AMDGPUInfo + ) +endif() + add_mlir_dialect_library(MLIRGPU IR/GPUDialect.cpp Transforms/AllReduceLowering.cpp @@ -15,6 +25,7 @@ add_mlir_dialect_library(MLIRGPU Transforms/ParallelLoopMapper.cpp Transforms/SerializeToBlob.cpp Transforms/SerializeToCubin.cpp + Transforms/SerializeToHsaco.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU @@ -23,6 +34,7 @@ add_mlir_dialect_library(MLIRGPU Core MC ${NVPTX_LIBS} + ${AMDGPU_LIBS} DEPENDS MLIRGPUOpsIncGen @@ -84,3 +96,58 @@ if(MLIR_CUDA_RUNNER_ENABLED) ) endif() + +if(MLIR_ROCM_RUNNER_ENABLED) + if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)) + message(SEND_ERROR + "Building mlir with ROCm support requires the AMDGPU backend") + endif() + + # Ensure lld is enabled. + if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS) + message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS") + endif() + + # Configure ROCm support. + if (NOT DEFINED ROCM_PATH) + if (NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") + else() + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") + endif() + set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which HIP has been installed") + endif() + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + find_package(HIP) + if (NOT HIP_FOUND) + message(SEND_ERROR "Building mlir with ROCm support requires a working ROCm and HIP install") + else() + message(STATUS "ROCm HIP version: ${HIP_VERSION}") + endif() + + target_compile_definitions(obj.MLIRGPU + PRIVATE + __HIP_PLATFORM_HCC__ + __ROCM_PATH__="${ROCM_PATH}" + MLIR_GPU_TO_HSACO_PASS_ENABLE=1 + ) + + target_include_directories(obj.MLIRGPU + PRIVATE + ${MLIR_SOURCE_DIR}/../lld/include + ${HIP_PATH}/include + ${ROCM_PATH}/include + ) + + target_link_libraries(MLIRGPU + PRIVATE + lldELF + MLIRROCDLToLLVMIRTranslation + ) + + # Link lldELF also to libmlir.so. Create an alias that starts with LLVM + # because LINK_COMPONENTS elements are implicitly prefixed with LLVM. + add_library(LLVMAliasTolldELF ALIAS lldELF) + set_property(GLOBAL APPEND PROPERTY MLIR_LLVM_LINK_COMPONENTS AliasTolldELF) + +endif() diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp new file mode 100644 index 000000000000..1369c1e57549 --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp @@ -0,0 +1,284 @@ +//===- SerializeToHsaco.cpp - Convert GPU kernel to HSACO blob ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass that serializes a gpu module into a HSAco blob and +// adds that blob as a string attribute of the module. +// +//===----------------------------------------------------------------------===// +#include "mlir/Dialect/GPU/Passes.h" + +#if MLIR_GPU_TO_HSACO_PASS_ENABLE +#include "mlir/Pass/Pass.h" +#include "mlir/Support/FileUtilities.h" +#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" +#include "mlir/Target/LLVMIR/Export.h" + +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCObjectFileInfo.h" +#include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCParser/MCTargetAsmParser.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" + +#include "llvm/Support/FileUtilities.h" +#include "llvm/Support/LineIterator.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Target/TargetOptions.h" + +#include "lld/Common/Driver.h" + +#include "hip/hip_version.h" + +#include <mutex> + +using namespace mlir; + +namespace { +class SerializeToHsacoPass + : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> { +public: + SerializeToHsacoPass(); + +private: + void getDependentDialects(DialectRegistry &registry) const override; + + // Serializes ROCDL to HSACO. + std::unique_ptr<std::vector<char>> + serializeISA(const std::string &isa) override; + + std::unique_ptr<SmallVectorImpl<char>> assembleIsa(const std::string &isa); + std::unique_ptr<std::vector<char>> + createHsaco(const SmallVectorImpl<char> &isaBinary); +}; +} // namespace + +static std::string getDefaultChip() { + const char kDefaultChip[] = "gfx900"; + + // Locate rocm_agent_enumerator. + const char kRocmAgentEnumerator[] = "rocm_agent_enumerator"; + llvm::ErrorOr<std::string> rocmAgentEnumerator = llvm::sys::findProgramByName( + kRocmAgentEnumerator, {__ROCM_PATH__ "/bin"}); + if (!rocmAgentEnumerator) { + llvm::WithColor::warning(llvm::errs()) + << kRocmAgentEnumerator << " couldn't be located under " << __ROCM_PATH__ + << "/bin\n"; + return kDefaultChip; + } + + // Prepare temp file to hold the outputs. + int tempFd = -1; + SmallString<128> tempFilename; + if (llvm::sys::fs::createTemporaryFile("rocm_agent", "txt", tempFd, + tempFilename)) { + llvm::WithColor::warning(llvm::errs()) + << "temporary file for " << kRocmAgentEnumerator << " creation error\n"; + return kDefaultChip; + } + llvm::FileRemover cleanup(tempFilename); + + // Invoke rocm_agent_enumerator. + std::string errorMessage; + SmallVector<StringRef, 5> args{"-t", "GPU"}; + Optional<StringRef> redirects[3] = {{""}, tempFilename.str(), {""}}; + int result = + llvm::sys::ExecuteAndWait(rocmAgentEnumerator.get(), args, llvm::None, + redirects, 0, 0, &errorMessage); + if (result) { + llvm::WithColor::warning(llvm::errs()) + << kRocmAgentEnumerator << " invocation error: " << errorMessage + << "\n"; + return kDefaultChip; + } + + // Load and parse the result. + auto gfxIsaList = openInputFile(tempFilename); + if (!gfxIsaList) { + llvm::WithColor::error(llvm::errs()) + << "read ROCm agent list temp file error\n"; + return kDefaultChip; + } + for (llvm::line_iterator lines(*gfxIsaList); !lines.is_at_end(); ++lines) { + // Skip the line with content "gfx000". + if (*lines == "gfx000") + continue; + // Use the first ISA version found.
+ return lines->str(); + } + + return kDefaultChip; +} + +// Sets the 'option' to 'value' unless it already has a value. +static void maybeSetOption(Pass::Option<std::string> &option, + function_ref<std::string()> getValue) { + if (!option.hasValue()) + option = getValue(); +} + +SerializeToHsacoPass::SerializeToHsacoPass() { + maybeSetOption(this->triple, [] { return "amdgcn-amd-amdhsa"; }); + maybeSetOption(this->chip, [] { + static auto chip = getDefaultChip(); + return chip; + }); +} + +void SerializeToHsacoPass::getDependentDialects( + DialectRegistry &registry) const { + registerROCDLDialectTranslation(registry); + gpu::SerializeToBlobPass::getDependentDialects(registry); +} + +std::unique_ptr<SmallVectorImpl<char>> +SerializeToHsacoPass::assembleIsa(const std::string &isa) { + auto loc = getOperation().getLoc(); + + SmallVector<char, 4096> result; + llvm::raw_svector_ostream os(result); + + llvm::Triple triple(llvm::Triple::normalize(this->triple)); + std::string error; + const llvm::Target *target = + llvm::TargetRegistry::lookupTarget(triple.normalize(), error); + if (!target) { + emitError(loc, Twine("failed to lookup target: ") + error); + return {}; + } + + llvm::SourceMgr srcMgr; + srcMgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(isa), + llvm::SMLoc()); + + const llvm::MCTargetOptions mcOptions; + std::unique_ptr<llvm::MCRegisterInfo> mri( + target->createMCRegInfo(this->triple)); + std::unique_ptr<llvm::MCAsmInfo> mai( + target->createMCAsmInfo(*mri, this->triple, mcOptions)); + mai->setRelaxELFRelocations(true); + + llvm::MCObjectFileInfo mofi; + llvm::MCContext ctx(mai.get(), mri.get(), &mofi, &srcMgr, &mcOptions); + mofi.InitMCObjectFileInfo(triple, false, ctx, false); + + SmallString<128> cwd; + if (!llvm::sys::fs::current_path(cwd)) + ctx.setCompilationDir(cwd); + + std::unique_ptr<llvm::MCStreamer> mcStreamer; + std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo()); + std::unique_ptr<llvm::MCSubtargetInfo> sti( + target->createMCSubtargetInfo(this->triple, this->chip, this->features)); + + llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, *mri, ctx); + llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions); + mcStreamer.reset(target->createMCObjectStreamer( + triple, ctx, std::unique_ptr<llvm::MCAsmBackend>(mab), + mab->createObjectWriter(os), std::unique_ptr<llvm::MCCodeEmitter>(ce), + *sti, mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible, + /*DWARFMustBeAtTheEnd*/ false)); + mcStreamer->setUseAssemblerInfoForParsing(true); + + std::unique_ptr<llvm::MCAsmParser> parser( + createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); + std::unique_ptr<llvm::MCTargetAsmParser> tap( + target->createMCAsmParser(*sti, *parser, *mcii, mcOptions)); + + if (!tap) { + emitError(loc, "assembler initialization error"); + return {}; + } + + parser->setTargetParser(*tap); + parser->Run(false); + + return std::make_unique<SmallVector<char, 4096>>(std::move(result)); +} + +std::unique_ptr<std::vector<char>> +SerializeToHsacoPass::createHsaco(const SmallVectorImpl<char> &isaBinary) { + auto loc = getOperation().getLoc(); + + // Save the ISA binary to a temp file. + int tempIsaBinaryFd = -1; + SmallString<128> tempIsaBinaryFilename; + if (llvm::sys::fs::createTemporaryFile("kernel", "o", tempIsaBinaryFd, + tempIsaBinaryFilename)) { + emitError(loc, "temporary file for ISA binary creation error"); + return {}; + } + llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename); + llvm::raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true); + tempIsaBinaryOs << StringRef(isaBinary.data(), isaBinary.size()); + tempIsaBinaryOs.close(); + + // Create a temp file for HSA code object.
+ int tempHsacoFD = -1; + SmallString<128> tempHsacoFilename; + if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD, + tempHsacoFilename)) { + emitError(loc, "temporary file for HSA code object creation error"); + return {}; + } + llvm::FileRemover cleanupHsaco(tempHsacoFilename); + + { + static std::mutex mutex; + const std::lock_guard<std::mutex> lock(mutex); + // Invoke lld. Expect a true return value from lld. + if (!lld::elf::link({"ld.lld", "-shared", tempIsaBinaryFilename.c_str(), + "-o", tempHsacoFilename.c_str()}, + /*canEarlyExit=*/false, llvm::outs(), llvm::errs())) { + emitError(loc, "lld invocation error"); + return {}; + } + } + + // Load the HSA code object. + auto hsacoFile = openInputFile(tempHsacoFilename); + if (!hsacoFile) { + emitError(loc, "read HSA code object from temp file error"); + return {}; + } + + StringRef buffer = hsacoFile->getBuffer(); + return std::make_unique<std::vector<char>>(buffer.begin(), buffer.end()); +} + +std::unique_ptr<std::vector<char>> +SerializeToHsacoPass::serializeISA(const std::string &isa) { + auto isaBinary = assembleIsa(isa); + if (!isaBinary) + return {}; + return createHsaco(*isaBinary); +} + +// Register pass to serialize GPU kernel functions to a HSACO binary annotation. +void mlir::registerGpuSerializeToHsacoPass() { + PassRegistration<SerializeToHsacoPass> registerSerializeToHSACO( + "gpu-to-hsaco", "Lower GPU kernel function to HSACO binary annotations", + [] { + // Initialize LLVM AMDGPU backend. + LLVMInitializeAMDGPUAsmParser(); + LLVMInitializeAMDGPUAsmPrinter(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTargetMC(); + + return std::make_unique<SerializeToHsacoPass>(); + }); +} +#else // MLIR_GPU_TO_HSACO_PASS_ENABLE +void mlir::registerGpuSerializeToHsacoPass() {} +#endif // MLIR_GPU_TO_HSACO_PASS_ENABLE diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index b9176cf1e89b..978bf1adedd5 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -7,6 +7,7 @@ set(LLVM_OPTIONAL_SOURCES CudaRuntimeWrappers.cpp SparseUtils.cpp ExecutionEngine.cpp + RocmRuntimeWrappers.cpp RunnerUtils.cpp OptUtils.cpp JitRunner.cpp @@ -136,3 +137,51 @@ if(MLIR_CUDA_RUNNER_ENABLED) ${CUDA_RUNTIME_LIBRARY} ) endif() + +if(MLIR_ROCM_RUNNER_ENABLED) + # Configure ROCm support. + if (NOT DEFINED ROCM_PATH) + if (NOT DEFINED ENV{ROCM_PATH}) + set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") + else() + set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") + endif() + set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH "Path to which HIP has been installed") + endif() + set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + find_package(HIP) + if (NOT HIP_FOUND) + message(SEND_ERROR "Building mlir with ROCm support requires a working ROCm and HIP install") + else() + message(STATUS "ROCm HIP version: ${HIP_VERSION}") + endif() + + # Locate HIP runtime library.
+ find_library(ROCM_RUNTIME_LIBRARY amdhip64 + PATHS "${HIP_PATH}/lib") + if (NOT ROCM_RUNTIME_LIBRARY) + message(SEND_ERROR "Could not locate ROCm HIP runtime library") + else() + message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}") + endif() + + add_mlir_library(mlir_rocm_runtime + SHARED + RocmRuntimeWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) + target_compile_definitions(mlir_rocm_runtime + PRIVATE + __HIP_PLATFORM_HCC__ + ) + target_include_directories(mlir_rocm_runtime + PRIVATE + ${HIP_PATH}/include + ${ROCM_PATH}/include + ) + target_link_libraries(mlir_rocm_runtime + PRIVATE + ${ROCM_RUNTIME_LIBRARY} + ) +endif() diff --git a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp similarity index 91% rename from mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp rename to mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp index 361ba8f8529d..399a37331060 100644 --- a/mlir/tools/mlir-rocm-runner/rocm-runtime-wrappers.cpp +++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp @@ -1,4 +1,4 @@ -//===- rocm-runtime-wrappers.cpp - MLIR ROCM runner wrapper library -------===// +//===- RocmRuntimeWrappers.cpp - MLIR ROCM runtime wrapper library --------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -30,29 +30,25 @@ fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ }(expr) -// Static reference to HIP primary context for device ordinal 0. -static hipCtx_t Context = [] { - HIP_REPORT_IF_ERROR(hipInit(/*flags=*/0)); - hipDevice_t device; - HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/0)); - hipCtx_t context; - HIP_REPORT_IF_ERROR(hipDevicePrimaryCtxRetain(&context, device)); - return context; -}(); - // Sets the `Context` for the duration of the instance and restores the previous // context on destruction. class ScopedContext { public: ScopedContext() { - HIP_REPORT_IF_ERROR(hipCtxGetCurrent(&previous)); - HIP_REPORT_IF_ERROR(hipCtxSetCurrent(Context)); + // Static reference to HIP primary context for device ordinal 0. + static hipCtx_t context = [] { + HIP_REPORT_IF_ERROR(hipInit(/*flags=*/0)); + hipDevice_t device; + HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/0)); + hipCtx_t ctx; + HIP_REPORT_IF_ERROR(hipDevicePrimaryCtxRetain(&ctx, device)); + return ctx; + }(); + + HIP_REPORT_IF_ERROR(hipCtxPushCurrent(context)); } - ~ScopedContext() { HIP_REPORT_IF_ERROR(hipCtxSetCurrent(previous)); } - -private: - hipCtx_t previous; + ~ScopedContext() { HIP_REPORT_IF_ERROR(hipCtxPopCurrent(nullptr)); } }; extern "C" hipModule_t mgpuModuleLoad(void *data) { diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 69d123d02047..775a462db53d 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -21,8 +21,7 @@ set(MLIR_DIALECT_LINALG_INTEGRATION_TEST_LIB_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTOR set(MLIR_RUNNER_UTILS_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) # Passed to lit.site.cfg.py.in to set up the path where to find the libraries -# for the mlir rocm / spirv / vulkan runner tests. -set(MLIR_ROCM_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) +# for the mlir spirv / vulkan runner tests. 
set(MLIR_SPIRV_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) set(MLIR_VULKAN_WRAPPER_LIBRARY_DIR ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}) @@ -75,6 +74,10 @@ if(MLIR_CUDA_RUNNER_ENABLED) list(APPEND MLIR_TEST_DEPENDS mlir_cuda_runtime) endif() +if(MLIR_ROCM_RUNNER_ENABLED) + list(APPEND MLIR_TEST_DEPENDS mlir_rocm_runtime) +endif() + list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests) if(LLVM_BUILD_EXAMPLES) @@ -89,12 +92,6 @@ if(LLVM_BUILD_EXAMPLES) ) endif() -if(MLIR_ROCM_RUNNER_ENABLED) - list(APPEND MLIR_TEST_DEPENDS - mlir-rocm-runner - ) -endif() - if(MLIR_SPIRV_CPU_RUNNER_ENABLED) add_subdirectory(mlir-spirv-cpu-runner) list(APPEND MLIR_TEST_DEPENDS diff --git a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir index 3d7deb906e77..fb19ac6491b3 100644 --- a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir +++ b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt %s --test-kernel-to-hsaco -split-input-file | FileCheck %s +// RUN: mlir-opt %s --test-gpu-to-hsaco | FileCheck %s -// CHECK: attributes {rocdl.hsaco = "HSACO"} +// CHECK: gpu.module @foo attributes {gpu.binary = "HSACO"} gpu.module @foo { llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr) // CHECK: attributes {gpu.kernel} @@ -9,8 +9,7 @@ gpu.module @foo { } } -// ----- - +// CHECK: gpu.module @bar attributes {gpu.binary = "HSACO"} gpu.module @bar { // CHECK: func @kernel_a llvm.func @kernel_a() diff --git a/mlir/test/Integration/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/GPU/CUDA/lit.local.cfg index b063ddda7e1d..0bdebfedeee3 100644 --- a/mlir/test/Integration/GPU/CUDA/lit.local.cfg +++ b/mlir/test/Integration/GPU/CUDA/lit.local.cfg @@ -1,2 +1,2 @@ if not config.enable_cuda_runner: - config.unsupported = True \ No newline at end of file + config.unsupported = True diff --git a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir similarity index 82% rename from mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir rename to mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir index 3f2d44fca38f..fdc525bd2659 100644 --- a/mlir/test/mlir-rocm-runner/gpu-to-hsaco.mlir +++ b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir @@ -1,5 +1,9 @@ -// RUN: mlir-rocm-runner %s \ -// RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \ +// RUN: mlir-opt %s \ +// RUN: -gpu-kernel-outlining \ +// RUN: -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \ +// RUN: -gpu-to-llvm \ +// RUN: | mlir-cpu-runner \ +// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ // RUN: --entry-point-result=void \ // RUN: | FileCheck %s diff --git a/mlir/test/mlir-rocm-runner/lit.local.cfg b/mlir/test/Integration/GPU/ROCM/lit.local.cfg similarity index 100% rename from mlir/test/mlir-rocm-runner/lit.local.cfg rename to mlir/test/Integration/GPU/ROCM/lit.local.cfg diff --git a/mlir/test/mlir-rocm-runner/two-modules.mlir b/mlir/test/Integration/GPU/ROCM/two-modules.mlir similarity index 74% rename from mlir/test/mlir-rocm-runner/two-modules.mlir rename to mlir/test/Integration/GPU/ROCM/two-modules.mlir index 7c0faae5d135..3c6c56b0091a 100644 --- a/mlir/test/mlir-rocm-runner/two-modules.mlir +++ b/mlir/test/Integration/GPU/ROCM/two-modules.mlir @@ -1,5 +1,9 @@ -// RUN: mlir-rocm-runner %s \ -// RUN: 
--shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN: mlir-opt %s \
+// RUN: -gpu-kernel-outlining \
+// RUN: -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN: -gpu-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
+// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
@@ -30,5 +34,5 @@ func @main() {
   return
 }
 
-func @mgpuMemGetDeviceMemRef1dInt32(%ptr : memref<?xi32>) -> (memref<?xi32>)
-func @print_memref_i32(%ptr : memref<*xi32>)
+func private @mgpuMemGetDeviceMemRef1dInt32(%ptr : memref<?xi32>) -> (memref<?xi32>)
+func private @print_memref_i32(%ptr : memref<*xi32>)
diff --git a/mlir/test/mlir-rocm-runner/vecadd.mlir b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
similarity index 87%
rename from mlir/test/mlir-rocm-runner/vecadd.mlir
rename to mlir/test/Integration/GPU/ROCM/vecadd.mlir
index d4dc862c60b6..917be3c93d08 100644
--- a/mlir/test/mlir-rocm-runner/vecadd.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
@@ -1,5 +1,10 @@
-// RUN: mlir-rocm-runner %s \
-// RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN: mlir-opt %s \
+// RUN: -convert-scf-to-std \
+// RUN: -gpu-kernel-outlining \
+// RUN: -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN: -gpu-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
+// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
diff --git a/mlir/test/mlir-rocm-runner/vector-transferops.mlir b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
similarity index 90%
rename from mlir/test/mlir-rocm-runner/vector-transferops.mlir
rename to mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
index eda541a2d814..c2807b64c064 100644
--- a/mlir/test/mlir-rocm-runner/vector-transferops.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
@@ -1,5 +1,10 @@
-// RUN: mlir-rocm-runner %s \
-// RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
+// RUN: mlir-opt %s \
+// RUN: -convert-scf-to-std \
+// RUN: -gpu-kernel-outlining \
+// RUN: -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl,gpu-to-hsaco)' \
+// RUN: -gpu-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
+// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
+// RUN: --entry-point-result=void \
+// RUN: | FileCheck %s
diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
index 58e890b907a4..5a3cb33526f6 100644
--- a/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
+++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToHsaco.cpp
@@ -6,11 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
-#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/Dialect/GPU/Passes.h"
+
 #include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassManager.h"
-#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
 #include "llvm/Support/TargetSelect.h"
@@ -18,38 +16,54 @@ using namespace mlir;
 
 #if MLIR_ROCM_CONVERSIONS_ENABLED
-static OwnedBlob compileIsaToHsacoForTesting(const std::string &, Location,
-                                             StringRef) {
-  const char data[] = "HSACO";
-  return std::make_unique<std::vector<char>>(data, data + sizeof(data) - 1);
+namespace {
+class TestSerializeToHsacoPass
+    : public PassWrapper<TestSerializeToHsacoPass, gpu::SerializeToBlobPass> {
+public:
+  TestSerializeToHsacoPass();
+
+private:
+  void getDependentDialects(DialectRegistry &registry) const override;
+
+  // Serializes ROCDL IR to HSACO.
+  std::unique_ptr<std::vector<char>>
+  serializeISA(const std::string &isa) override;
+};
+} // namespace
+
+TestSerializeToHsacoPass::TestSerializeToHsacoPass() {
+  this->triple = "amdgcn-amd-amdhsa";
+  this->chip = "gfx900";
+}
+
+void TestSerializeToHsacoPass::getDependentDialects(
+    DialectRegistry &registry) const {
+  registerROCDLDialectTranslation(registry);
+  gpu::SerializeToBlobPass::getDependentDialects(registry);
 }
 
-static std::unique_ptr<llvm::Module>
-translateModuleToROCDL(Operation *m, llvm::LLVMContext &llvmContext,
-                       StringRef moduleName) {
-  registerLLVMDialectTranslation(*m->getContext());
-  registerROCDLDialectTranslation(*m->getContext());
-  return translateModuleToLLVMIR(m, llvmContext, moduleName);
+std::unique_ptr<std::vector<char>>
+TestSerializeToHsacoPass::serializeISA(const std::string &) {
+  std::string data = "HSACO";
+  return std::make_unique<std::vector<char>>(data.begin(), data.end());
 }
 
 namespace mlir {
 namespace test {
-void registerTestConvertGPUKernelToHsacoPass() {
-  PassPipelineRegistration<>(
-      "test-kernel-to-hsaco",
-      "Convert all kernel functions to ROCm hsaco blobs",
-      [](OpPassManager &pm) {
+// Register test pass to serialize GPU module to a HSAco binary annotation.
+void registerTestGpuSerializeToHsacoPass() {
+  PassRegistration<TestSerializeToHsacoPass> registerSerializeToHsaco(
+      "test-gpu-to-hsaco",
+      "Lower GPU kernel function to HSAco binary annotations", [] {
         // Initialize LLVM AMDGPU backend.
        LLVMInitializeAMDGPUTarget();
        LLVMInitializeAMDGPUTargetInfo();
        LLVMInitializeAMDGPUTargetMC();
        LLVMInitializeAMDGPUAsmPrinter();
-        pm.addPass(createConvertGPUKernelToBlobPass(
-            translateModuleToROCDL, compileIsaToHsacoForTesting,
-            "amdgcn-amd-amdhsa", "gfx900", "-code-object-v3", "rocdl.hsaco"));
+        return std::make_unique<TestSerializeToHsacoPass>();
       });
 }
 } // namespace test
 } // namespace mlir
-#endif
+#endif // MLIR_ROCM_CONVERSIONS_ENABLED
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 4ba36202578d..199d7222e1cf 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -77,7 +77,6 @@ tools.extend([
     ToolSubst('toy-ch5', unresolved='ignore'),
     ToolSubst('%linalg_test_lib_dir', config.linalg_test_lib_dir, unresolved='ignore'),
     ToolSubst('%mlir_runner_utils_dir', config.mlir_runner_utils_dir, unresolved='ignore'),
-    ToolSubst('%rocm_wrapper_library_dir', config.rocm_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%spirv_wrapper_library_dir', config.spirv_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%vulkan_wrapper_library_dir', config.vulkan_wrapper_library_dir, unresolved='ignore'),
     ToolSubst('%mlir_integration_test_dir', config.mlir_integration_test_dir, unresolved='ignore'),
diff --git a/mlir/test/lit.site.cfg.py.in b/mlir/test/lit.site.cfg.py.in
index 0015c1369d7a..dbc8460df576 100644
--- a/mlir/test/lit.site.cfg.py.in
+++ b/mlir/test/lit.site.cfg.py.in
@@ -39,7 +39,6 @@ config.build_examples = @LLVM_BUILD_EXAMPLES@
 config.run_cuda_tests = @MLIR_CUDA_CONVERSIONS_ENABLED@
 config.enable_cuda_runner = @MLIR_CUDA_RUNNER_ENABLED@
 config.run_rocm_tests = @MLIR_ROCM_CONVERSIONS_ENABLED@
-config.rocm_wrapper_library_dir = "@MLIR_ROCM_WRAPPER_LIBRARY_DIR@"
 config.enable_rocm_runner = @MLIR_ROCM_RUNNER_ENABLED@
 config.spirv_wrapper_library_dir = "@MLIR_SPIRV_WRAPPER_LIBRARY_DIR@"
 config.enable_spirv_cpu_runner = @MLIR_SPIRV_CPU_RUNNER_ENABLED@
diff --git a/mlir/tools/CMakeLists.txt b/mlir/tools/CMakeLists.txt
index 37793ce65ab1..ac9ca8167320 100644
--- a/mlir/tools/CMakeLists.txt
+++ b/mlir/tools/CMakeLists.txt
@@ -1,7 +1,6 @@
 add_subdirectory(mlir-cpu-runner)
 add_subdirectory(mlir-opt)
 add_subdirectory(mlir-reduce)
-add_subdirectory(mlir-rocm-runner)
 add_subdirectory(mlir-shlib)
 add_subdirectory(mlir-spirv-cpu-runner)
 add_subdirectory(mlir-translate)
diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp
index 241cee572cb1..428b3d506317 100644
--- a/mlir/tools/mlir-opt/mlir-opt.cpp
+++ b/mlir/tools/mlir-opt/mlir-opt.cpp
@@ -65,7 +65,7 @@ void registerTestCallGraphPass();
 void registerTestConstantFold();
 void registerTestConvVectorization();
 void registerTestGpuSerializeToCubinPass();
-void registerTestConvertGPUKernelToHsacoPass();
+void registerTestGpuSerializeToHsacoPass();
 void registerTestDataLayoutQuery();
 void registerTestDecomposeCallGraphTypes();
 void registerTestDialect(DialectRegistry &);
@@ -140,7 +140,7 @@ void registerTestPasses() {
   test::registerTestGpuSerializeToCubinPass();
 #endif
 #if MLIR_ROCM_CONVERSIONS_ENABLED
-  test::registerTestConvertGPUKernelToHsacoPass();
+  test::registerTestGpuSerializeToHsacoPass();
 #endif
   test::registerTestConvVectorization();
   test::registerTestDecomposeCallGraphTypes();
diff --git a/mlir/tools/mlir-rocm-runner/CMakeLists.txt b/mlir/tools/mlir-rocm-runner/CMakeLists.txt
deleted file mode 100644
index d2381413e158..000000000000
--- a/mlir/tools/mlir-rocm-runner/CMakeLists.txt
+++ /dev/null
@@ -1,127 +0,0 @@
-set(LLVM_OPTIONAL_SOURCES
-  rocm-runtime-wrappers.cpp
-  mlir-rocm-runner.cpp
-  )
-
-if(MLIR_ROCM_RUNNER_ENABLED) - if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD)) - message(SEND_ERROR - "Building the mlir rocm runner requires the AMDGPU backend") - endif() - - # Ensure lld is enabled. - if (NOT "lld" IN_LIST LLVM_ENABLE_PROJECTS) - message(SEND_ERROR "lld is not enabled. Please revise LLVM_ENABLE_PROJECTS") - endif() - - # lld header files. - include_directories(${MLIR_SOURCE_DIR}/../lld/include) - - # Configure ROCm support. - if (NOT DEFINED ROCM_PATH) - if (NOT DEFINED ENV{ROCM_PATH}) - set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed") - else() - set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to which ROCm has been installed") - endif() - set(HIP_PATH "${ROCM_PATH}/hip" CACHE PATH " Path to which HIP has been installed") - endif() - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) - find_package(HIP) - if (NOT HIP_FOUND) - message(SEND_ERROR "Build the mlir rocm runner requires a working ROCm and HIP install") - else() - message(STATUS "ROCm HIP version: ${HIP_VERSION}") - endif() - - # Set compile-time flags for ROCm path. - add_definitions(-D__ROCM_PATH__="${ROCM_PATH}") - - # Locate HIP runtime library. - find_library(ROCM_RUNTIME_LIBRARY amdhip64 - PATHS "${HIP_PATH}/lib") - if (NOT ROCM_RUNTIME_LIBRARY) - message(SEND_ERROR "Could not locate ROCm HIP runtime library") - else() - message(STATUS "ROCm HIP runtime lib: ${ROCM_RUNTIME_LIBRARY}") - endif() - - # Set HIP compile-time flags. - add_definitions(-D__HIP_PLATFORM_HCC__) - - add_mlir_library(rocm-runtime-wrappers - SHARED - rocm-runtime-wrappers.cpp - - EXCLUDE_FROM_LIBMLIR - ) - target_include_directories(rocm-runtime-wrappers - PRIVATE - "${HIP_PATH}/../include" - "${HIP_PATH}/include" - ) - target_link_libraries(rocm-runtime-wrappers - PRIVATE - ${ROCM_RUNTIME_LIBRARY} - ) - - get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) - set(LIBS - ${conversion_libs} - lldCommon - lldDriver - lldELF - MLIRJitRunner - MLIRAnalysis - MLIREDSC - MLIRExecutionEngine - MLIRGPU - MLIRIR - MLIRLLVMIR - MLIRLLVMToLLVMIRTranslation - MLIRParser - MLIRROCDLIR - MLIRStandard - MLIRSupport - MLIRTargetLLVMIRExport - MLIRROCDLToLLVMIRTranslation - MLIRTransforms - MLIRTranslation - ${ROCM_RUNTIME_LIBRARY} - ) - - # Manually expand the target library, since our MLIR libraries - # aren't plugged into the LLVM dependency tracking. 
If we don't - # do this then we can't insert the CodeGen library after ourselves - llvm_expand_pseudo_components(TARGET_LIBS AllTargetsCodeGens AllTargetsAsmParsers) - # Prepend LLVM in front of every target, this is how the library - # are named with CMake - SET(targets_to_link) - FOREACH(t ${TARGET_LIBS}) - LIST(APPEND targets_to_link "LLVM${t}") - ENDFOREACH(t) - - add_llvm_tool(mlir-rocm-runner - mlir-rocm-runner.cpp - - DEPENDS - rocm-runtime-wrappers - - LINK_COMPONENTS - - Core - LTO - MC - MCParser - Option - Support - ) - llvm_update_compile_flags(mlir-rocm-runner) - target_include_directories(mlir-rocm-runner - PRIVATE - "${HIP_PATH}/../include" - "${HIP_PATH}/include" - ) - target_link_libraries(mlir-rocm-runner PRIVATE ${LIBS} ${targets_to_link}) - -endif() diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp deleted file mode 100644 index c2f9abbf73c8..000000000000 --- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp +++ /dev/null @@ -1,349 +0,0 @@ -//===- mlir-rocm-runner.cpp - MLIR ROCM Execution Driver-------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This is a command line utility that executes an MLIR file on the GPU by -// translating MLIR to ROCDL/LLVM IR before JIT-compiling and executing the -// latter. -// -//===----------------------------------------------------------------------===// - -#include "llvm/ADT/STLExtras.h" - -#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" -#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" -#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" -#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" -#include "mlir/Dialect/GPU/GPUDialect.h" -#include "mlir/Dialect/GPU/Passes.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/Dialect/LLVMIR/ROCDLDialect.h" -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/ExecutionEngine/JitRunner.h" -#include "mlir/ExecutionEngine/OptUtils.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Support/FileUtilities.h" -#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h" -#include "mlir/Target/LLVMIR/Export.h" -#include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/Passes.h" -#include "llvm/Support/ErrorOr.h" -#include "llvm/Support/FileUtilities.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/LineIterator.h" -#include "llvm/Support/Program.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" - -// MC headers. 
-#include "llvm/MC/MCAsmBackend.h" -#include "llvm/MC/MCAsmInfo.h" -#include "llvm/MC/MCCodeEmitter.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCInstPrinter.h" -#include "llvm/MC/MCInstrInfo.h" -#include "llvm/MC/MCObjectFileInfo.h" -#include "llvm/MC/MCObjectWriter.h" -#include "llvm/MC/MCParser/AsmLexer.h" -#include "llvm/MC/MCParser/MCTargetAsmParser.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCStreamer.h" -#include "llvm/MC/MCSubtargetInfo.h" -#include "llvm/MC/MCTargetOptionsCommandFlags.h" - -// lld headers. -#include "lld/Common/Driver.h" - -// HIP headers. -#include "hip/hip_version.h" - -#include - -using namespace mlir; -using namespace llvm; - -using Blob = SmallVector; - -static cl::opt tripleName("triple", cl::desc("target triple"), - cl::value_desc("triple string"), - cl::init("amdgcn-amd-amdhsa")); - -static cl::opt targetChip("target", cl::desc("target chip"), - cl::value_desc("AMDGPU ISA version"), - cl::init("")); - -static cl::opt features("feature", cl::desc("target features"), - cl::value_desc("AMDGPU target features"), - cl::init("")); - -static constexpr const char kRunnerProgram[] = "mlir-rocm-runner"; -static constexpr const char kRocmAgentEnumerator[] = "rocm_agent_enumerator"; -static constexpr const char kDefaultTargetChip[] = "gfx900"; - -static LogicalResult assembleIsa(const std::string isa, StringRef name, - Blob &result) { - raw_svector_ostream os(result); - - std::string error; - Triple theTriple(Triple::normalize(tripleName)); - const Target *theTarget = - TargetRegistry::lookupTarget(theTriple.normalize(), error); - if (!theTarget) { - WithColor::error(errs(), name) << error; - return failure(); - } - - SourceMgr srcMgr; - srcMgr.AddNewSourceBuffer(MemoryBuffer::getMemBuffer(isa), SMLoc()); - - const MCTargetOptions mcOptions; - std::unique_ptr mri(theTarget->createMCRegInfo(tripleName)); - std::unique_ptr mai( - theTarget->createMCAsmInfo(*mri, tripleName, mcOptions)); - mai->setRelaxELFRelocations(true); - - MCObjectFileInfo mofi; - MCContext ctx(mai.get(), mri.get(), &mofi, &srcMgr, &mcOptions); - mofi.InitMCObjectFileInfo(theTriple, false, ctx, false); - - SmallString<128> cwd; - if (!sys::fs::current_path(cwd)) - ctx.setCompilationDir(cwd); - - std::unique_ptr mcStreamer; - std::unique_ptr mcii(theTarget->createMCInstrInfo()); - std::unique_ptr sti( - theTarget->createMCSubtargetInfo(tripleName, targetChip, features)); - - MCCodeEmitter *ce = theTarget->createMCCodeEmitter(*mcii, *mri, ctx); - MCAsmBackend *mab = theTarget->createMCAsmBackend(*sti, *mri, mcOptions); - mcStreamer.reset(theTarget->createMCObjectStreamer( - theTriple, ctx, std::unique_ptr(mab), - mab->createObjectWriter(os), std::unique_ptr(ce), *sti, - mcOptions.MCRelaxAll, mcOptions.MCIncrementalLinkerCompatible, - /*DWARFMustBeAtTheEnd*/ false)); - mcStreamer->setUseAssemblerInfoForParsing(true); - - std::unique_ptr parser( - createMCAsmParser(srcMgr, ctx, *mcStreamer, *mai)); - std::unique_ptr tap( - theTarget->createMCAsmParser(*sti, *parser, *mcii, mcOptions)); - - if (!tap) { - WithColor::error(errs(), name) << "assembler initialization error.\n"; - return failure(); - } - - parser->setTargetParser(*tap); - parser->Run(false); - - return success(); -} - -static std::mutex mutex; -static LogicalResult createHsaco(const Blob &isaBlob, StringRef name, - Blob &hsacoBlob) { - // Save the ISA binary to a temp file. 
-  int tempIsaBinaryFd = -1;
-  SmallString<128> tempIsaBinaryFilename;
-  std::error_code ec = sys::fs::createTemporaryFile(
-      "kernel", "o", tempIsaBinaryFd, tempIsaBinaryFilename);
-  if (ec) {
-    WithColor::error(errs(), name)
-        << "temporary file for ISA binary creation error.\n";
-    return failure();
-  }
-  FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
-  raw_fd_ostream tempIsaBinaryOs(tempIsaBinaryFd, true);
-  tempIsaBinaryOs << isaBlob;
-  tempIsaBinaryOs.close();
-
-  // Create a temp file for HSA code object.
-  int tempHsacoFD = -1;
-  SmallString<128> tempHsacoFilename;
-  ec = sys::fs::createTemporaryFile("kernel", "hsaco", tempHsacoFD,
-                                    tempHsacoFilename);
-  if (ec) {
-    WithColor::error(errs(), name)
-        << "temporary file for HSA code object creation error.\n";
-    return failure();
-  }
-  FileRemover cleanupHsaco(tempHsacoFilename);
-
-  const std::lock_guard<std::mutex> lock(mutex);
-  // Invoke lld. Expect a true return value from lld.
-  bool ret = lld::elf::link({"ld.lld", "-shared", tempIsaBinaryFilename.c_str(),
-                             "-o", tempHsacoFilename.c_str()},
-                            /*canEarlyExit=*/false, llvm::outs(), llvm::errs());
-  if (!ret) {
-    WithColor::error(errs(), name) << "lld invocation error.\n";
-    return failure();
-  }
-
-  // Load the HSA code object.
-  auto hsacoFile = mlir::openInputFile(tempHsacoFilename);
-  if (!hsacoFile) {
-    WithColor::error(errs(), name)
-        << "read HSA code object from temp file error.\n";
-    return failure();
-  }
-  hsacoBlob.assign(hsacoFile->getBuffer().begin(),
-                   hsacoFile->getBuffer().end());
-
-  return success();
-}
-
-static std::unique_ptr<llvm::Module>
-compileModuleToROCDLIR(Operation *m, llvm::LLVMContext &llvmContext,
-                       StringRef name) {
-  auto llvmModule = translateModuleToROCDLIR(m, llvmContext, name);
-  // TODO: Link with ROCm-Device-Libs in case needed (ex: the Module
-  // depends on math functions).
-  return llvmModule;
-}
-
-static OwnedBlob compileISAToHsaco(const std::string isa, Location loc,
-                                   StringRef name) {
-  // ISA -> ISA in binary form via MC.
-  // Use lld to create HSA code object.
-  Blob isaBlob;
-  Blob hsacoBlob;
-
-  if (succeeded(assembleIsa(isa, name, isaBlob)) &&
-      succeeded(createHsaco(isaBlob, name, hsacoBlob)))
-    return std::make_unique<std::vector<char>>(hsacoBlob.begin(),
-                                               hsacoBlob.end());
-
-  WithColor::error(errs(), name) << "producing HSA code object error.\n";
-  return {};
-}
-
-static void configTargetChip() {
-  // Set targetChip to default value first.
-  targetChip = kDefaultTargetChip;
-
-  // Locate rocm_agent_enumerator.
-  llvm::ErrorOr<std::string> rocmAgentEnumerator = llvm::sys::findProgramByName(
-      kRocmAgentEnumerator, {__ROCM_PATH__ "/bin"});
-  std::error_code ec;
-  if ((ec = rocmAgentEnumerator.getError())) {
-    WithColor::warning(errs(), kRunnerProgram)
-        << kRocmAgentEnumerator << " couldn't be located under "
-        << __ROCM_PATH__ << ", set target as " << kDefaultTargetChip << "\n";
-    return;
-  }
-
-  // Prepare temp file to hold the outputs.
-  int tempFd = -1;
-  SmallString<128> tempFilename;
-  ec = sys::fs::createTemporaryFile("rocm_agent", "txt", tempFd, tempFilename);
-  if (ec) {
-    WithColor::warning(errs(), kRunnerProgram)
-        << "temporary file for " << kRocmAgentEnumerator
-        << " creation error, set target as " << kDefaultTargetChip << "\n";
-    return;
-  }
-  FileRemover cleanup(tempFilename);
-
-  // Invoke rocm_agent_enumerator.
-  std::string errorMessage;
-  SmallVector<StringRef, 2> args{"-t", "GPU"};
-  Optional<StringRef> redirects[3] = {{""}, tempFilename.str(), {""}};
-  int result =
-      llvm::sys::ExecuteAndWait(rocmAgentEnumerator.get(), args, llvm::None,
-                                redirects, 0, 0, &errorMessage);
-  if (result) {
-    WithColor::warning(errs(), kRunnerProgram)
-        << kRocmAgentEnumerator << " invocation error: " << errorMessage
-        << ", set target as " << kDefaultTargetChip << "\n";
-    return;
-  }
-
-  // Load and parse the result.
-  auto gfxIsaList = mlir::openInputFile(tempFilename);
-  if (!gfxIsaList) {
-    WithColor::error(errs(), kRunnerProgram)
-        << "read ROCm agent list temp file error, set target as "
-        << kDefaultTargetChip << "\n";
-    return;
-  }
-  for (line_iterator lines(*gfxIsaList); !lines.is_at_end(); ++lines) {
-    // Skip the line with content "gfx000".
-    if (*lines == "gfx000")
-      continue;
-    // Use the first ISA version found.
-    targetChip = lines->str();
-    break;
-  }
-}
-
-static void configTargetFeatures() {
-  if (features.size() > 0)
-    features += ",";
-  // After ROCm 3.5, adopt HSA code object V3.
-  if (HIP_VERSION_MAJOR >= 3 && HIP_VERSION_MINOR >= 5)
-    features += "+code-object-v3";
-  else
-    features += "-code-object-v3";
-}
-
-static LogicalResult runMLIRPasses(ModuleOp m) {
-  PassManager pm(m.getContext());
-  applyPassManagerCLOptions(pm);
-
-  // Configure target chip ISA version if it has not been specified.
-  if (!targetChip.size())
-    configTargetChip();
-
-  // Configure target features per ROCm / HIP version.
-  configTargetFeatures();
-
-  const char gpuBinaryAnnotation[] = "rocdl.hsaco";
-  pm.addPass(createLowerToCFGPass());
-  pm.addPass(createGpuKernelOutliningPass());
-  auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
-  kernelPm.addPass(createStripDebugInfoPass());
-  kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
-  kernelPm.addPass(createConvertGPUKernelToBlobPass(
-      compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
-      features, gpuBinaryAnnotation));
-  pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation));
-
-  return pm.run(m);
-}
-
-int main(int argc, char **argv) {
-  registerPassManagerCLOptions();
-  llvm::InitLLVM y(argc, argv);
-  llvm::InitializeAllTargetInfos();
-  llvm::InitializeAllTargetMCs();
-  llvm::InitializeAllAsmParsers();
-
-  // Initialize LLVM AMDGPU backend.
- LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTargetMC(); - LLVMInitializeAMDGPUAsmPrinter(); - - mlir::initializeLLVMPasses(); - - mlir::JitRunnerConfig jitRunnerConfig; - jitRunnerConfig.mlirTransformer = runMLIRPasses; - - mlir::DialectRegistry registry; - registry.insert(); - mlir::registerLLVMDialectTranslation(registry); - mlir::registerROCDLDialectTranslation(registry); - - return mlir::JitRunnerMain(argc, argv, registry, jitRunnerConfig); -} -- GitLab From a1d6c652e3a0fba31377474af2436c9a9ceac6cc Mon Sep 17 00:00:00 2001 From: Max Kazantsev Date: Fri, 19 Mar 2021 14:25:08 +0700 Subject: [PATCH 0118/1000] [Test] Precommit one more test --- .../IndVarSimplify/eliminate-comparison.ll | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll index c367176b4b59..48a51d723d24 100644 --- a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll +++ b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll @@ -1094,4 +1094,56 @@ exit: ret i32 %iv } +define i32 @func_28(i32 %start) { +; CHECK-LABEL: @func_28( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[C1:%.*]] = icmp ne i32 [[IV]], 0 +; CHECK-NEXT: [[C2:%.*]] = icmp ne i32 [[IV]], 0 +; CHECK-NEXT: [[C3:%.*]] = icmp ne i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[C1]], label [[CHECKED_1:%.*]], label [[FAIL:%.*]] +; CHECK: checked.1: +; CHECK-NEXT: br i1 [[C2]], label [[CHECKED_2:%.*]], label [[FAIL]] +; CHECK: checked.2: +; CHECK-NEXT: br i1 [[C3]], label [[BACKEDGE]], label [[FAIL]] +; CHECK: backedge: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 +; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: fail: +; CHECK-NEXT: unreachable +; CHECK: exit: +; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA1]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [%start, %entry], [%iv.next, %backedge] + %c1 = icmp ne i32 %iv, 0 + %c2 = icmp ne i32 %iv, 0 + %c3 = icmp ne i32 %iv, 0 + br i1 %c1, label %checked.1, label %fail + +checked.1: + br i1 %c2, label %checked.2, label %fail + +checked.2: + br i1 %c3, label %backedge, label %fail + +backedge: + %iv.next = add i32 %iv, 758394 + %loop.cond = call i1 @cond_func() + br i1 %loop.cond, label %loop, label %exit + +fail: + unreachable + +exit: + ret i32 %iv +} + !0 = !{i32 0, i32 2147483647} -- GitLab From d09adfd3993cbc1043b4d20232bce8bd774232cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 18 Mar 2021 09:44:01 +0200 Subject: [PATCH 0119/1000] [lit] Handle plain negations directly in the internal shell Keep running "not --crash" via the external "not" executable, but for plain negations, and for cases that use the shell "!" operator, just skip that argument and invert the return code. The libcxx tests only use the shell operator "!" for negations, never the "not" executable, because libcxx tests can be run without having a fully built llvm tree available providing the "not" executable. This allows using the internal shell for libcxx tests. 
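For illustration, the intended semantics can be sketched in isolation
(hypothetical helper names, not the actual TestRunner code; only plain
negations are folded in this way, while "not --crash" still runs via the
external executable):

    def strip_negations(args):
        # Drop leading '!' / bare 'not' tokens; an odd count negates.
        count = 0
        while args and args[0] in ('!', 'not'):
            args = args[1:]
            count += 1
        return args, count % 2 != 0

    def run_with_negation(args, run_process):
        cmd, negate = strip_negations(args)
        rc = run_process(cmd)
        return int(not rc) if negate else rc

    assert run_with_negation(['true'], lambda c: 0) == 0
    assert run_with_negation(['!', 'true'], lambda c: 0) == 1
    assert run_with_negation(['not', '!', 'true'], lambda c: 0) == 0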
Differential Revision: https://reviews.llvm.org/D98859 --- llvm/utils/lit/lit/TestRunner.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index f826bc91fb3e..820cbce962c1 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -608,6 +608,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): assert isinstance(cmd, ShUtil.Pipeline) procs = [] + negate_procs = [] default_stdin = subprocess.PIPE stderrTempFiles = [] opened_files = [] @@ -653,6 +654,12 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): if not args: raise InternalShellError(j, "Error: 'not' requires a" " subcommand") + elif args[0] == '!': + not_args.append(args.pop(0)) + not_count += 1 + if not args: + raise InternalShellError(j, "Error: '!' requires a" + " subcommand") else: break @@ -699,7 +706,15 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): # the assumptions that (1) environment variables are not intended to be # relevant to 'not' commands and (2) the 'env' command should always # blindly pass along the status it receives from any command it calls. - args = not_args + args + + # For plain negations, either 'not' without '--crash', or the shell + # operator '!', leave them out from the command to execute and + # invert the result code afterwards. + if not_crash: + args = not_args + args + not_count = 0 + else: + not_args = [] stdin, stdout, stderr = processRedirects(j, default_stdin, cmd_shenv, opened_files) @@ -763,6 +778,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): stderr = stderr, env = cmd_shenv.env, close_fds = kUseCloseFDs)) + negate_procs.append((not_count % 2) != 0) # Let the helper know about this process timeoutHelper.addProcess(procs[-1]) except OSError as e: @@ -815,6 +831,8 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): # Detect Ctrl-C in subprocess. if res == -signal.SIGINT: raise KeyboardInterrupt + if negate_procs[i]: + res = not res # Ensure the resulting output is always of string type. try: -- GitLab From c9fc1a979cbaf86b00e66140a235f5fdc1250bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 16 Mar 2021 13:19:11 +0200 Subject: [PATCH 0120/1000] [libcxx] [test] Explicitly check that some env vars are ignored in the temp_dir_path test This was suggested in the review of D98139. 
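Reduced to a standalone sketch, the shape of the new checks is roughly the
following (using raw POSIX setenv() rather than the test suite's
PutEnv/TEST_CHECK helpers, so this is illustrative only):

    #include <cassert>
    #include <cstdlib>      // setenv() is POSIX, not standard C++
    #include <filesystem>

    int main() {
      namespace fs = std::filesystem;
      // Remember the result before defining any of the ignored variables.
      fs::path fallback = fs::temp_directory_path();
      // USERPROFILE is only consulted on Windows; POSIX must ignore it.
      setenv("USERPROFILE", "/no/such/dir", /*overwrite=*/1);
      assert(fs::temp_directory_path() == fallback);
    }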
Differential Revision: https://reviews.llvm.org/D98696 --- .../temp_directory_path.pass.cpp | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.temp_dir_path/temp_directory_path.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.temp_dir_path/temp_directory_path.pass.cpp index 32748ded1428..28331c77b9a5 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.temp_dir_path/temp_directory_path.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.temp_dir_path/temp_directory_path.pass.cpp @@ -64,6 +64,14 @@ TEST_CASE(basic_tests) {"TMP", env.create_dir("dir2")}, {"TEMP", env.create_dir("dir3")}, {"TEMPDIR", env.create_dir("dir4")} +#endif + }; + TestCase ignored_cases[] = { +#ifdef _WIN32 + {"TMPDIR", env.create_dir("dir5")}, + {"TEMPDIR", env.create_dir("dir6")}, +#else + {"USERPROFILE", env.create_dir("dir5")}, #endif }; for (auto& TC : cases) { @@ -114,6 +122,7 @@ TEST_CASE(basic_tests) UnsetEnv(TC.name); } // No env variables are defined + path fallback; { std::error_code ec = GetTestEC(); path ret = temp_directory_path(ec); @@ -123,6 +132,20 @@ TEST_CASE(basic_tests) TEST_CHECK(ret == "/tmp"); #endif TEST_CHECK(is_directory(ret)); + fallback = ret; + } + for (auto& TC : ignored_cases) { + // Check that certain variables are ignored + PutEnv(TC.name, TC.p); + std::error_code ec = GetTestEC(); + path ret = temp_directory_path(ec); + TEST_CHECK(!ec); + + // Check that we return the same as above when no vars were defined. + TEST_CHECK(ret == fallback); + + // Finally erase this env variable + UnsetEnv(TC.name); } } -- GitLab From 9de63b2e051cb3e79645cc20b83b4d33d132cba0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 18 Mar 2021 14:08:10 +0200 Subject: [PATCH 0121/1000] [lit] Pass the USERPROFILE variable through on Windows When running in a Windows Container, the Git for Windows Unix tools (C:\Program Files\Git\usr\bin) just hang if this variable isn't passed through. Currently, running the LLVM/clang tests in a Windows Container fails if that directory is added to the path, but succeeds after this change. (After this change, the previously used GnuWin tools can be left out entirely, too, as lit automatically picks up the Git for Windows tools if necessary.) Differential Revision: https://reviews.llvm.org/D98858 --- llvm/utils/lit/lit/TestingConfig.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/lit/lit/TestingConfig.py b/llvm/utils/lit/lit/TestingConfig.py index 612db574677e..e1d13aa3a064 100644 --- a/llvm/utils/lit/lit/TestingConfig.py +++ b/llvm/utils/lit/lit/TestingConfig.py @@ -33,6 +33,7 @@ class TestingConfig(object): pass_vars.append('INCLUDE') pass_vars.append('LIB') pass_vars.append('PATHEXT') + pass_vars.append('USERPROFILE') environment['PYTHONBUFFERED'] = '1' for var in pass_vars: -- GitLab From 926cca9679fb27eb1db9f27a5dfa902d42f968b5 Mon Sep 17 00:00:00 2001 From: Clement Courbet Date: Fri, 19 Mar 2021 08:47:28 +0100 Subject: [PATCH 0122/1000] [InstCombine] Add unit test with @llvm.annotation. 
In preparation for https://reviews.llvm.org/D98925 --- .../InstCombine/annotation-intrinsic.ll | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/annotation-intrinsic.ll diff --git a/llvm/test/Transforms/InstCombine/annotation-intrinsic.ll b/llvm/test/Transforms/InstCombine/annotation-intrinsic.ll new file mode 100644 index 000000000000..bfc7649bbab0 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/annotation-intrinsic.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -instcombine %s -S -o - | FileCheck %s + +; This tests that llvm.annotation does not prevent load combining. + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev4-linux-gnu" + +declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1 + +define dso_local i32 @annotated(i32* %c) local_unnamed_addr #0 { +; CHECK-LABEL: @annotated( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.annotation.i32(i32 [[TMP0]], i8* undef, i8* undef, i32 undef) +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[C]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret i32 [[ADD]] +; +entry: + %0 = load i32, i32* %c, align 4 + %1 = call i32 @llvm.annotation.i32(i32 %0, i8* undef, i8* undef, i32 undef) + %2 = load i32, i32* %c, align 4 + %add = add nsw i32 %1, %2 + ret i32 %add +} + +attributes #0 = { nofree nounwind uwtable willreturn mustprogress } -- GitLab From 6d22ba48ea492c2d1244c22d0e8dfb7a1fb80ff5 Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Fri, 19 Mar 2021 09:26:14 +0100 Subject: [PATCH 0123/1000] [NVPTX] Fix warning, remove extra ";" [NFC] gcc complained with ../lib/Target/NVPTX/NVPTXLowerArgs.cpp:203:2: warning: extra ';' [-Wpedantic] 203 | }; | ^ --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 56643f64e6c2..0143f4f4b62a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -200,7 +200,7 @@ static bool isALoadChain(Value *Start) { return false; } return true; -}; +} void NVPTXLowerArgs::handleByValParam(Argument *Arg) { Function *Func = Arg->getParent(); -- GitLab From 74ffe8dc590c29f1895e7b9cabf13944ffef16cb Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Fri, 19 Mar 2021 09:27:55 +0100 Subject: [PATCH 0124/1000] [mlir] Remove ConvertKernelFuncToBlob All users have been converted to gpu::SerializeToBlobPass. 
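For reference, the replacement API is subclass-based rather than
callback-based. A minimal sketch, mirroring TestSerializeToHsacoPass
earlier in this series (names are illustrative; the base class handles
translation to LLVM IR and ISA emission):

    #include "mlir/Dialect/GPU/Passes.h"
    #include "mlir/Pass/Pass.h"

    using namespace mlir;

    class MySerializeToBlobPass
        : public PassWrapper<MySerializeToBlobPass, gpu::SerializeToBlobPass> {
      // Turn the target ISA into the binary blob that ends up in the
      // gpu.binary module attribute.
      std::unique_ptr<std::vector<char>>
      serializeISA(const std::string &isa) override {
        return std::make_unique<std::vector<char>>(isa.begin(), isa.end());
      }
    };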
Reviewed By: ftynse Differential Revision: https://reviews.llvm.org/D98928 --- .../mlir/Conversion/GPUCommon/GPUCommonPass.h | 31 ------- mlir/lib/Conversion/GPUCommon/CMakeLists.txt | 1 - .../GPUCommon/ConvertKernelFuncToBlob.cpp | 93 ------------------- 3 files changed, 125 deletions(-) delete mode 100644 mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h index fb5e8202df63..173d8feced35 100644 --- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h +++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h @@ -60,37 +60,6 @@ createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation = {}); void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, OwningRewritePatternList &patterns, StringRef gpuBinaryAnnotation = {}); - -/// Creates a pass to convert kernel functions into GPU target object blobs. -/// -/// This transformation takes the body of each function that is annotated with -/// the 'gpu.kernel' attribute, copies it to a new LLVM module, compiles the -/// module with help of the GPU backend to target object and then invokes -/// the provided blobGenerator to produce a binary blob. Such blob is then -/// attached as a string attribute to the kernel function. -/// -/// Following callbacks are to be provided by user: -/// - loweringCallback : lower the module to an LLVM module. -/// - blobGenerator : build a blob executable on target GPU. -/// -/// Information wrt LLVM backend are to be supplied by user: -/// - triple : target triple to be used. -/// - targetChip : mcpu to be used. -/// - features : target-specific features to be used. -/// -/// Information about result attribute is to be specified by user: -/// - gpuBinaryAnnotation : the name of the attribute which contains the blob. -/// -/// After the transformation, the body of the kernel function is removed (i.e., -/// it is turned into a declaration). -/// -/// A non-empty gpuBinaryAnnotation overrides the pass' command line option. -std::unique_ptr> -createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback, - BlobGenerator blobGenerator, StringRef triple, - StringRef targetChip, StringRef features, - StringRef gpuBinaryAnnotation = {}); - } // namespace mlir #endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt index d9f6867556c6..04ff2a994091 100644 --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -16,7 +16,6 @@ endif() add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms ConvertLaunchFuncToRuntimeCalls.cpp - ConvertKernelFuncToBlob.cpp GPUOpsLowering.cpp DEPENDS diff --git a/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp b/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp deleted file mode 100644 index e8f9a7a46936..000000000000 --- a/mlir/lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp +++ /dev/null @@ -1,93 +0,0 @@ -//===- ConvertKernelFuncToBlob.cpp - MLIR GPU lowering passes -------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements a pass to convert gpu kernel functions into a -// corresponding binary blob that can be executed on a GPU. Currently -// only translates the function itself but no dependencies. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" - -#include "mlir/Dialect/GPU/GPUDialect.h" -#include "mlir/Dialect/GPU/Passes.h" -#include "mlir/Dialect/LLVMIR/LLVMDialect.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/BuiltinOps.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Pass/PassRegistry.h" -#include "mlir/Support/LogicalResult.h" - -#include "llvm/ADT/Optional.h" -#include "llvm/ADT/Twine.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/Mutex.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Support/TargetSelect.h" - -using namespace mlir; - -namespace { - -/// A pass converting tagged kernel modules to a blob with target instructions. -/// -/// If tagged as a kernel module, each contained function is translated to -/// user-specified IR. A user provided BlobGenerator then compiles the IR to -/// GPU binary code, which is then attached as an attribute to the function. -/// The function body is erased. -class GpuKernelToBlobPass - : public PassWrapper { -public: - GpuKernelToBlobPass(LoweringCallback loweringCallback, - BlobGenerator blobGenerator, StringRef triple, - StringRef targetChip, StringRef features, - StringRef gpuBinaryAnnotation) - : loweringCallback(loweringCallback), blobGenerator(blobGenerator) { - if (!triple.empty()) - this->triple = triple.str(); - if (!targetChip.empty()) - this->chip = targetChip.str(); - if (!features.empty()) - this->features = features.str(); - if (!gpuBinaryAnnotation.empty()) - this->gpuBinaryAnnotation = gpuBinaryAnnotation.str(); - } - -private: - // Translates the 'getOperation()' result to an LLVM module. - // Note: when this class is removed, this function no longer needs to be - // virtual. - std::unique_ptr - translateToLLVMIR(llvm::LLVMContext &llvmContext) override { - return loweringCallback(getOperation(), llvmContext, "LLVMDialectModule"); - } - - // Serializes the target ISA to binary form. - std::unique_ptr> - serializeISA(const std::string &isa) override { - return blobGenerator(isa, getOperation().getLoc(), - getOperation().getName()); - } - - LoweringCallback loweringCallback; - BlobGenerator blobGenerator; -}; - -} // anonymous namespace - -std::unique_ptr> -mlir::createConvertGPUKernelToBlobPass(LoweringCallback loweringCallback, - BlobGenerator blobGenerator, - StringRef triple, StringRef targetChip, - StringRef features, - StringRef gpuBinaryAnnotation) { - return std::make_unique(loweringCallback, blobGenerator, - triple, targetChip, features, - gpuBinaryAnnotation); -} -- GitLab From 628f5c9da29b64777b96cb6787c06b14d288a792 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Thu, 18 Mar 2021 22:35:48 +0100 Subject: [PATCH 0125/1000] [mlir] Add a roundtrip test for 'linalg.tiled_loop' on buffers. 
https://llvm.discourse.group/t/rfc-add-linalg-tileop/2833 Differential Revision: https://reviews.llvm.org/D98900 --- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 4 +- mlir/test/Dialect/Linalg/roundtrip.mlir | 60 ++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 13cca7f19ee7..f456c588ffaf 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -1863,7 +1863,9 @@ static ParseResult parseTiledLoopOp(OpAsmParser &parser, if (parser.resolveOperands(outputs, outputTypes, outputsOperandsLoc, result.operands)) return failure(); - result.addTypes(outputTypes); + for (Type outputType : outputTypes) + if (outputType.isa()) + result.addTypes(outputType); } // Parse attributes. diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir index ab2547952cfb..084b8a339c0d 100644 --- a/mlir/test/Dialect/Linalg/roundtrip.mlir +++ b/mlir/test/Dialect/Linalg/roundtrip.mlir @@ -6,6 +6,8 @@ // Test that we can lower all the way to LLVM without crashing, don't check results here. // DISABLED: mlir-opt %s --convert-linalg-to-llvm -o=/dev/null 2>&1 +// CHECK-DAG: #[[$id_2d:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> +// CHECK-DAG: #[[$id_1d:.*]] = affine_map<(d0, d1, d2) -> (d1)> // CHECK-DAG: #[[$permute_0:.*]] = affine_map<(d0, d1, d2) -> (d0, d2, d1)> // CHECK-DAG: #[[$permute_1:.*]] = affine_map<(d0, d1, d2) -> (d2, d1, d0)> // CHECK-DAG: #[[$reshape5D01:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1)> @@ -881,3 +883,61 @@ func @tiled_loop_reduction(%input_3d: tensor<16x24x32xf32>, } // CHECK-LABEL: func @tiled_loop_reduction // CHECK: iterators[ + +// ----- + +#trait_6 = { + indexing_maps = [ + #id_3d, + #id_2d, + #id_1d, + #id_1d + ], + iterator_types = ["reduction", "parallel", "reduction"] +} +#map_1 = affine_map<(d0, d1, d2)[s0] -> (d0 * 768 + s0 + d1 * 32 + d2)> +#map_2 = affine_map<(d0, d1)[s0] -> (d0 * 32 + s0 + d1)> +#map_3 = affine_map<(d0)[s0] -> (d0 + s0)> + +func @tiled_loop_on_buffers(%input_3d: memref<16x24x32xf32>, + %input_2d: memref<16x32xf32>, + %input_1d: memref<24xf32>, + %output: memref<24xf32>) { + %c0 = constant 0 : index + %c1 = constant 1 : index + %c2 = constant 2 : index + %c4 = constant 4 : index + %c8 = constant 8 : index + %X = memref.dim %input_3d, %c0 : memref<16x24x32xf32> + %Y = memref.dim %input_3d, %c1 : memref<16x24x32xf32> + %Z = memref.dim %input_3d, %c2 : memref<16x24x32xf32> + linalg.tiled_loop (%i, %j, %k) = (%c0, %c0, %c0) + to (%X, %Y, %Z) step (%c2, %c4, %c8) + ins(%input_3d, %input_2d: memref<16x24x32xf32>, memref<16x32xf32>) + outs( %output: memref<24xf32>) + iterators["reduction", "parallel", "reduction"] { + %sub_3d = memref.subview %input_3d[%i, %j, %k][2, 4, 8][1, 1, 1] + : memref<16x24x32xf32> to memref<2x4x8xf32, #map_1> + %sub_2d = memref.subview %input_2d[%i, %k][2, 8][1, 1] + : memref<16x32xf32> to memref<2x8xf32, #map_2> + %sub_1d = memref.subview %input_1d[%j] [4] [1] + : memref<24xf32> to memref<4xf32, #map_3> + %sub_out = memref.subview %output[%j] [4] [1] + : memref<24xf32> to memref<4xf32, #map_3> + linalg.generic #trait_6 + ins(%sub_3d, %sub_2d, %sub_1d + : memref<2x4x8xf32, #map_1>, + memref<2x8xf32, #map_2>, + memref<4xf32, #map_3>) + outs(%sub_out : memref<4xf32, #map_3>) { + ^bb0(%i3d: f32, %i2d: f32, %i1d: f32, %o: f32): + %0 = addf %i3d, %i2d : f32 + %1 = addf %0, %i1d : f32 + linalg.yield %1 : f32 + } + linalg.yield + } + return +} 
+// CHECK-LABEL: func @tiled_loop_on_buffers +// CHECK: iterators[ -- GitLab From a96897219daf43a1b90e1e0c9dbf20167c0c16af Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 18 Mar 2021 22:36:01 +0000 Subject: [PATCH 0126/1000] [KnownBits] Add knownbits analysis for mulhs/mulu 'multiply high' instructions Split off from D98857 https://reviews.llvm.org/D98866 --- llvm/include/llvm/Support/KnownBits.h | 6 ++++++ llvm/lib/Support/KnownBits.cpp | 18 ++++++++++++++++++ llvm/unittests/Support/KnownBitsTest.cpp | 20 +++++++++++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/Support/KnownBits.h b/llvm/include/llvm/Support/KnownBits.h index d854aadbd430..c27ddb0ce804 100644 --- a/llvm/include/llvm/Support/KnownBits.h +++ b/llvm/include/llvm/Support/KnownBits.h @@ -296,6 +296,12 @@ public: /// Compute known bits resulting from multiplying LHS and RHS. static KnownBits computeForMul(const KnownBits &LHS, const KnownBits &RHS); + /// Compute known bits from sign-extended multiply-hi. + static KnownBits mulhs(const KnownBits &LHS, const KnownBits &RHS); + + /// Compute known bits from zero-extended multiply-hi. + static KnownBits mulhu(const KnownBits &LHS, const KnownBits &RHS); + /// Compute known bits for udiv(LHS, RHS). static KnownBits udiv(const KnownBits &LHS, const KnownBits &RHS); diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp index 6c7aaad968f5..423a908eed57 100644 --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -489,6 +489,24 @@ KnownBits KnownBits::computeForMul(const KnownBits &LHS, const KnownBits &RHS) { return Res; } +KnownBits KnownBits::mulhs(const KnownBits &LHS, const KnownBits &RHS) { + unsigned BitWidth = LHS.getBitWidth(); + assert(BitWidth == RHS.getBitWidth() && !LHS.hasConflict() && + !RHS.hasConflict() && "Operand mismatch"); + KnownBits WideLHS = LHS.sext(2 * BitWidth); + KnownBits WideRHS = RHS.sext(2 * BitWidth); + return computeForMul(WideLHS, WideRHS).extractBits(BitWidth, BitWidth); +} + +KnownBits KnownBits::mulhu(const KnownBits &LHS, const KnownBits &RHS) { + unsigned BitWidth = LHS.getBitWidth(); + assert(BitWidth == RHS.getBitWidth() && !LHS.hasConflict() && + !RHS.hasConflict() && "Operand mismatch"); + KnownBits WideLHS = LHS.zext(2 * BitWidth); + KnownBits WideRHS = RHS.zext(2 * BitWidth); + return computeForMul(WideLHS, WideRHS).extractBits(BitWidth, BitWidth); +} + KnownBits KnownBits::udiv(const KnownBits &LHS, const KnownBits &RHS) { unsigned BitWidth = LHS.getBitWidth(); assert(!LHS.hasConflict() && !RHS.hasConflict()); diff --git a/llvm/unittests/Support/KnownBitsTest.cpp b/llvm/unittests/Support/KnownBitsTest.cpp index 4e69df49837e..5f2133165d24 100644 --- a/llvm/unittests/Support/KnownBitsTest.cpp +++ b/llvm/unittests/Support/KnownBitsTest.cpp @@ -113,6 +113,8 @@ TEST(KnownBitsTest, BinaryExhaustive) { KnownBits KnownSMax(KnownAnd); KnownBits KnownSMin(KnownAnd); KnownBits KnownMul(KnownAnd); + KnownBits KnownMulHS(KnownAnd); + KnownBits KnownMulHU(KnownAnd); KnownBits KnownUDiv(KnownAnd); KnownBits KnownURem(KnownAnd); KnownBits KnownSRem(KnownAnd); @@ -156,6 +158,14 @@ TEST(KnownBitsTest, BinaryExhaustive) { KnownMul.One &= Res; KnownMul.Zero &= ~Res; + Res = (N1.sext(2 * Bits) * N2.sext(2 * Bits)).extractBits(Bits, Bits); + KnownMulHS.One &= Res; + KnownMulHS.Zero &= ~Res; + + Res = (N1.zext(2 * Bits) * N2.zext(2 * Bits)).extractBits(Bits, Bits); + KnownMulHU.One &= Res; + KnownMulHU.Zero &= ~Res; + if (!N2.isNullValue()) { Res = N1.udiv(N2); 
KnownUDiv.One &= Res; @@ -218,12 +228,20 @@ TEST(KnownBitsTest, BinaryExhaustive) { EXPECT_EQ(KnownSMin.Zero, ComputedSMin.Zero); EXPECT_EQ(KnownSMin.One, ComputedSMin.One); - // ComputedMul is conservatively correct, but not guaranteed to be + // The following are conservatively correct, but not guaranteed to be // precise. KnownBits ComputedMul = KnownBits::computeForMul(Known1, Known2); EXPECT_TRUE(ComputedMul.Zero.isSubsetOf(KnownMul.Zero)); EXPECT_TRUE(ComputedMul.One.isSubsetOf(KnownMul.One)); + KnownBits ComputedMulHS = KnownBits::mulhs(Known1, Known2); + EXPECT_TRUE(ComputedMulHS.Zero.isSubsetOf(KnownMulHS.Zero)); + EXPECT_TRUE(ComputedMulHS.One.isSubsetOf(KnownMulHS.One)); + + KnownBits ComputedMulHU = KnownBits::mulhu(Known1, Known2); + EXPECT_TRUE(ComputedMulHU.Zero.isSubsetOf(KnownMulHU.Zero)); + EXPECT_TRUE(ComputedMulHU.One.isSubsetOf(KnownMulHU.One)); + KnownBits ComputedUDiv = KnownBits::udiv(Known1, Known2); EXPECT_TRUE(ComputedUDiv.Zero.isSubsetOf(KnownUDiv.Zero)); EXPECT_TRUE(ComputedUDiv.One.isSubsetOf(KnownUDiv.One)); -- GitLab From 1d7cf550721c51030144f3cd295c5789d51c4aad Mon Sep 17 00:00:00 2001 From: Kristof Beyls Date: Fri, 19 Mar 2021 10:27:34 +0100 Subject: [PATCH 0127/1000] [docs] Add calendar info for SVE sync-ups --- llvm/docs/GettingInvolved.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index c6856cc77eb1..e177cb695282 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -157,7 +157,8 @@ writing, the following sync-ups are organized: - * - Scalable Vectors and Arm SVE - Monthly, every 3rd Tuesday - - + - `ics `__ + `gcal `__ - `Minutes/docs `__ * - ML Guided Compiler Optimizations - Monthly -- GitLab From 7dd76cccca02ee59588647f2d97f1b554c48f580 Mon Sep 17 00:00:00 2001 From: Muhammad Omair Javaid Date: Fri, 19 Mar 2021 15:29:53 +0500 Subject: [PATCH 0128/1000] [LLDB] Skip TestExitDuringExpression on aarch64/linux buildbot TestExitDuringExpression test_exit_before_one_thread_unwind fails sporadically on both Arm and AArch64 linux buildbots. This seems like a thread timing issue. I am marking it skip for now. --- .../thread/exit_during_expression/TestExitDuringExpression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/API/functionalities/thread/exit_during_expression/TestExitDuringExpression.py b/lldb/test/API/functionalities/thread/exit_during_expression/TestExitDuringExpression.py index 4ee65c85e8f1..dafc0a967605 100644 --- a/lldb/test/API/functionalities/thread/exit_during_expression/TestExitDuringExpression.py +++ b/lldb/test/API/functionalities/thread/exit_during_expression/TestExitDuringExpression.py @@ -15,7 +15,7 @@ class TestExitDuringExpression(TestBase): NO_DEBUG_INFO_TESTCASE = True @skipIfWindows - @skipIf(oslist=["linux"], archs=["arm"], bugnumber="llvm.org/pr48414") + @skipIf(oslist=["linux"], archs=["arm", "aarch64"], bugnumber="llvm.org/pr48414") @expectedFailureAll(oslist=["freebsd"], bugnumber="llvm.org/pr48414") @expectedFailureNetBSD def test_exit_before_one_thread_unwind(self): -- GitLab From f3dd783b239f5587213d528dc642b599f43452b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 19 Mar 2021 12:30:08 +0200 Subject: [PATCH 0129/1000] Revert "[lit] Handle plain negations directly in the internal shell" This reverts commit d09adfd3993cbc1043b4d20232bce8bd774232cc. That commit caused failures in clang-tidy/infrastructure/validate-check-names.cpp on windows buildbots. 
That change exposed a surprising issue, not directly related to this change in itself, but in how TestRunner quotes command line arguments that later are going to be interpreted by a msys based tool (like grep.exe, when provided by Git for Windows). This worked accidentally before, when grep was invoked via not.exe which took a more conservative approach to windows argument quoting. --- llvm/utils/lit/lit/TestRunner.py | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 820cbce962c1..f826bc91fb3e 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -608,7 +608,6 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): assert isinstance(cmd, ShUtil.Pipeline) procs = [] - negate_procs = [] default_stdin = subprocess.PIPE stderrTempFiles = [] opened_files = [] @@ -654,12 +653,6 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): if not args: raise InternalShellError(j, "Error: 'not' requires a" " subcommand") - elif args[0] == '!': - not_args.append(args.pop(0)) - not_count += 1 - if not args: - raise InternalShellError(j, "Error: '!' requires a" - " subcommand") else: break @@ -706,15 +699,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): # the assumptions that (1) environment variables are not intended to be # relevant to 'not' commands and (2) the 'env' command should always # blindly pass along the status it receives from any command it calls. - - # For plain negations, either 'not' without '--crash', or the shell - # operator '!', leave them out from the command to execute and - # invert the result code afterwards. - if not_crash: - args = not_args + args - not_count = 0 - else: - not_args = [] + args = not_args + args stdin, stdout, stderr = processRedirects(j, default_stdin, cmd_shenv, opened_files) @@ -778,7 +763,6 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): stderr = stderr, env = cmd_shenv.env, close_fds = kUseCloseFDs)) - negate_procs.append((not_count % 2) != 0) # Let the helper know about this process timeoutHelper.addProcess(procs[-1]) except OSError as e: @@ -831,8 +815,6 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper): # Detect Ctrl-C in subprocess. if res == -signal.SIGINT: raise KeyboardInterrupt - if negate_procs[i]: - res = not res # Ensure the resulting output is always of string type. try: -- GitLab From c96dfe0d8bfbc3d4e08af33d5036e2453524b97a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 18 Mar 2021 16:13:16 +0000 Subject: [PATCH 0130/1000] [AMDGPU] Sink Intrinsic::getDeclaration calls to where they are used. NFC. 
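The pattern, as a generic sketch (illustrative names, not the actual
AMDGPU code): a value that is only needed on one path is computed on that
path instead of up front.

    #include <string>

    std::string lookupDeclaration();   // stand-in for Intrinsic::getDeclaration
    void use(const std::string &f);

    void before(bool cond) {
      std::string f = lookupDeclaration();  // computed even when unused
      if (cond)
        use(f);
    }

    void after(bool cond) {
      if (cond)
        use(lookupDeclaration());           // sunk to its only use
    }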
---
 .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 22 +++++++++----------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 3f913cd9cba8..1b98eb04e0d8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -287,10 +287,6 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
       Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
-  Function *PermLaneX16 =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
-  Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
 
   for (unsigned Idx = 0; Idx < 4; Idx++) {
     V = buildNonAtomicBinOp(
@@ -317,9 +313,9 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
 
   // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
   // 48..63).
-  Value *const PermX =
-      B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
-                                 B.getFalse(), B.getFalse()});
+  Value *const PermX = B.CreateIntrinsic(
+      Intrinsic::amdgcn_permlanex16, {},
+      {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
   V = buildNonAtomicBinOp(
       B, Op, V,
       B.CreateCall(UpdateDPP,
@@ -327,7 +323,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
                    B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
   if (!ST->isWave32()) {
     // Combine lane 31 into lanes 32..63.
-    Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+    Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+                                            {V, B.getInt32(31)});
     V = buildNonAtomicBinOp(
         B, Op, V,
         B.CreateCall(UpdateDPP,
@@ -346,10 +343,6 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
   Module *M = B.GetInsertBlock()->getModule();
   Function *UpdateDPP =
      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
-  Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
-  Function *WriteLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
 
   if (ST->hasDPPWavefrontShifts()) {
     // GFX9 has DPP wavefront shift operations.
@@ -357,6 +350,11 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
+    Function *ReadLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+    Function *WriteLane =
+        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
     Value *Old = V;
-- 
GitLab

From 685335a0146e6fbb5f6841ff5a9ebce8cdccc6e7 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Thu, 18 Mar 2021 16:52:04 +0000
Subject: [PATCH 0131/1000] [AMDGPU] Remove duplicate test functions. NFC.

Most of the churn in the test diff below comes from the generated check
labels (e.g. BB5_2 becoming BB3_2) being renumbered once the duplicate
functions are gone.
--- .../atomic_optimizations_local_pointer.ll | 802 ++++-------------- 1 file changed, 164 insertions(+), 638 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index 1753cdcf407b..b797e3efc373 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -575,474 +575,6 @@ entry: ret void } -define amdgpu_kernel void @add_i32_varying_gfx1032(i32 addrspace(1)* %out) { -; -; -; GFX7LESS-LABEL: add_i32_varying_gfx1032: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: add_i32_varying_gfx1032: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB3_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB3_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: add_i32_varying_gfx1032: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, 
exec_hi, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB3_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB3_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: add_i32_varying_gfx1032: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 
48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB3_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB3_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: add_i32_varying_gfx1032: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB3_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB3_2: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel - store i32 %old, i32 addrspace(1)* %out - ret void -} - -define amdgpu_kernel void 
@add_i32_varying_gfx1064(i32 addrspace(1)* %out) { -; -; -; GFX7LESS-LABEL: add_i32_varying_gfx1064: -; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX7LESS-NEXT: s_endpgm -; -; GFX8-LABEL: add_i32_varying_gfx1064: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v2, v0 -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: v_mov_b32_e32 v2, 0 -; GFX8-NEXT: s_not_b64 exec, exec -; GFX8-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX8-NEXT: v_readlane_b32 s4, v2, 63 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX8-NEXT: s_mov_b64 exec, s[2:3] -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB4_2 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX8-NEXT: v_mov_b32_e32 v3, s4 -; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 -; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: add_i32_varying_gfx1064: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: s_not_b64 exec, exec -; GFX9-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: 
v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf -; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf -; GFX9-NEXT: v_readlane_b32 s4, v2, 63 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf -; GFX9-NEXT: s_mov_b64 exec, s[2:3] -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB4_2 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo -; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 -; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: s_endpgm -; -; GFX1064-LABEL: add_i32_varying_gfx1064: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, v0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_not_b64 exec, exec -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v3, 0 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1064-NEXT: v_mov_b32_e32 v2, v1 -; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1064-NEXT: v_mov_b32_e32 v2, s4 -; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GFX1064-NEXT: v_readlane_b32 s4, v1, 15 -; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s5, v1, 31 -; GFX1064-NEXT: v_writelane_b32 v3, s4, 16 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GFX1064-NEXT: v_readlane_b32 s7, v1, 63 -; GFX1064-NEXT: v_readlane_b32 s6, v1, 47 -; GFX1064-NEXT: v_writelane_b32 v3, s5, 32 -; GFX1064-NEXT: s_mov_b64 exec, s[2:3] -; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 -; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GFX1064-NEXT: v_writelane_b32 v3, s6, 48 -; GFX1064-NEXT: s_mov_b64 exec, s[4:5] -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: ; implicit-def: $vgpr0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB4_2 -; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1064-NEXT: v_mov_b32_e32 v4, s7 -; GFX1064-NEXT: s_mov_b32 s3, s7 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB4_2: -; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1064-NEXT: v_mov_b32_e32 v0, v3 -; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: add_i32_varying_gfx1064: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, v0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX1032-NEXT: v_mov_b32_e32 v2, v1 -; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GFX1032-NEXT: v_mov_b32_e32 v3, 0 -; GFX1032-NEXT: v_readlane_b32 s3, v1, 15 -; GFX1032-NEXT: v_readlane_b32 s4, v1, 31 -; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 -; GFX1032-NEXT: v_writelane_b32 v3, s3, 16 -; GFX1032-NEXT: s_mov_b32 exec_lo, s2 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: ; implicit-def: $vgpr0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB4_2 -; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo -; GFX1032-NEXT: v_mov_b32_e32 v4, s4 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB4_2: -; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 -; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 -; GFX1032-NEXT: v_mov_b32_e32 v0, v3 -; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm -entry: - %lane = call i32 @llvm.amdgcn.workitem.id.x() - %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel - store i32 %old, i32 addrspace(1)* %out - ret void -} - define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; ; @@ -1055,7 +587,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB5_2 +; GFX7LESS-NEXT: s_cbranch_execz BB3_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1066,7 +598,7 @@ define 
amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB5_2: +; GFX7LESS-NEXT: BB3_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -1090,7 +622,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB5_2 +; GFX8-NEXT: s_cbranch_execz BB3_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -1101,7 +633,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB5_2: +; GFX8-NEXT: BB3_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1124,7 +656,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB5_2 +; GFX9-NEXT: s_cbranch_execz BB3_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -1134,7 +666,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB5_2: +; GFX9-NEXT: BB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1157,7 +689,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB5_2 +; GFX1064-NEXT: s_cbranch_execz BB3_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1169,7 +701,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB5_2: +; GFX1064-NEXT: BB3_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1189,7 +721,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB5_2 +; GFX1032-NEXT: s_cbranch_execz BB3_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1201,7 +733,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB5_2: +; GFX1032-NEXT: BB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -1230,7 +762,7 @@ define 
amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB6_2 +; GFX7LESS-NEXT: s_cbranch_execz BB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1245,7 +777,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB6_2: +; GFX7LESS-NEXT: BB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -1273,7 +805,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB6_2 +; GFX8-NEXT: s_cbranch_execz BB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 @@ -1288,7 +820,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB6_2: +; GFX8-NEXT: BB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 @@ -1316,7 +848,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB6_2 +; GFX9-NEXT: s_cbranch_execz BB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1330,7 +862,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB6_2: +; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -1358,7 +890,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB6_2 +; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1374,7 +906,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB6_2: +; GFX1064-NEXT: BB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1399,7 +931,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB6_2 +; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: 
s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1415,7 +947,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB6_2: +; GFX1032-NEXT: BB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1437,9 +969,6 @@ entry: ret void } -; GCN-NOT: v_mbcnt_lo_u32_b32 -; GCN-NOT: v_mbcnt_hi_u32_b32 -; GCN-NOT: s_bcnt1_i32_b64 define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { ; ; @@ -1533,7 +1062,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB8_2 +; GFX7LESS-NEXT: s_cbranch_execz BB6_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 @@ -1543,7 +1072,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB8_2: +; GFX7LESS-NEXT: BB6_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -1563,7 +1092,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB8_2 +; GFX8-NEXT: s_cbranch_execz BB6_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_mul_i32 s2, s2, 5 @@ -1573,7 +1102,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB8_2: +; GFX8-NEXT: BB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1594,7 +1123,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB8_2 +; GFX9-NEXT: s_cbranch_execz BB6_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 @@ -1603,7 +1132,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB8_2: +; GFX9-NEXT: BB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1624,7 +1153,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB8_2 +; GFX1064-NEXT: s_cbranch_execz BB6_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1635,7 +1164,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB8_2: +; GFX1064-NEXT: BB6_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1655,7 +1184,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB8_2 +; GFX1032-NEXT: s_cbranch_execz BB6_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1666,7 +1195,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB8_2: +; GFX1032-NEXT: BB6_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -1696,7 +1225,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB9_2 +; GFX7LESS-NEXT: s_cbranch_execz BB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1707,7 +1236,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB9_2: +; GFX7LESS-NEXT: BB7_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 @@ -1728,7 +1257,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX8-NEXT: s_cbranch_execz BB9_2 +; GFX8-NEXT: s_cbranch_execz BB7_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1739,7 +1268,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB9_2: +; GFX8-NEXT: BB7_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1760,7 +1289,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz BB9_2 +; GFX9-NEXT: s_cbranch_execz BB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1770,7 +1299,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB9_2: +; GFX9-NEXT: BB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1792,7 +1321,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; 
GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz BB9_2 +; GFX1064-NEXT: s_cbranch_execz BB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1804,7 +1333,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB9_2: +; GFX1064-NEXT: BB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1826,7 +1355,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB9_2 +; GFX1032-NEXT: s_cbranch_execz BB7_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1838,7 +1367,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB9_2: +; GFX1032-NEXT: BB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1902,7 +1431,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB10_2 +; GFX8-NEXT: s_cbranch_execz BB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -1910,7 +1439,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB10_2: +; GFX8-NEXT: BB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -1953,14 +1482,14 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB10_2 +; GFX9-NEXT: s_cbranch_execz BB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB10_2: +; GFX9-NEXT: BB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2012,7 +1541,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB10_2 +; GFX1064-NEXT: s_cbranch_execz BB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2022,7 +1551,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB10_2: +; GFX1064-NEXT: BB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 
0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -2063,7 +1592,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB10_2 +; GFX1032-NEXT: s_cbranch_execz BB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -2072,7 +1601,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB10_2: +; GFX1032-NEXT: BB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -2101,7 +1630,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB11_2 +; GFX7LESS-NEXT: s_cbranch_execz BB9_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -2112,7 +1641,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB11_2: +; GFX7LESS-NEXT: BB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -2136,7 +1665,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB11_2 +; GFX8-NEXT: s_cbranch_execz BB9_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -2147,7 +1676,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB11_2: +; GFX8-NEXT: BB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 @@ -2171,7 +1700,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB11_2 +; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -2181,7 +1710,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB11_2: +; GFX9-NEXT: BB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s3, v2 @@ -2205,7 +1734,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB11_2 +; GFX1064-NEXT: s_cbranch_execz BB9_2 ; 
GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -2217,7 +1746,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB11_2: +; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -2240,7 +1769,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB11_2 +; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -2252,7 +1781,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB11_2: +; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -2284,7 +1813,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB12_2 +; GFX7LESS-NEXT: s_cbranch_execz BB10_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -2299,7 +1828,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB12_2: +; GFX7LESS-NEXT: BB10_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -2327,7 +1856,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB12_2 +; GFX8-NEXT: s_cbranch_execz BB10_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 @@ -2342,7 +1871,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB12_2: +; GFX8-NEXT: BB10_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 @@ -2370,7 +1899,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB12_2 +; GFX9-NEXT: s_cbranch_execz BB10_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2384,7 +1913,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: BB12_2: +; GFX9-NEXT: BB10_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -2412,7 +1941,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB12_2 +; GFX1064-NEXT: s_cbranch_execz BB10_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -2428,7 +1957,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB12_2: +; GFX1064-NEXT: BB10_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -2453,7 +1982,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB12_2 +; GFX1032-NEXT: s_cbranch_execz BB10_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -2469,7 +1998,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB12_2: +; GFX1032-NEXT: BB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2491,9 +2020,6 @@ entry: ret void } -; GCN-NOT: v_mbcnt_lo_u32_b32 -; GCN-NOT: v_mbcnt_hi_u32_b32 -; GCN-NOT: s_bcnt1_i32_b64 define amdgpu_kernel void @sub_i64_varying(i64 addrspace(1)* %out) { ; ; @@ -2622,7 +2148,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB14_2 +; GFX8-NEXT: s_cbranch_execz BB12_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2630,7 +2156,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB14_2: +; GFX8-NEXT: BB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2673,14 +2199,14 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB14_2 +; GFX9-NEXT: s_cbranch_execz BB12_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB14_2: +; GFX9-NEXT: BB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2732,7 +2258,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; 
GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB14_2 +; GFX1064-NEXT: s_cbranch_execz BB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2742,7 +2268,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB14_2: +; GFX1064-NEXT: BB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -2783,7 +2309,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB14_2 +; GFX1032-NEXT: s_cbranch_execz BB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -2792,7 +2318,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB14_2: +; GFX1032-NEXT: BB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -2856,7 +2382,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB15_2 +; GFX8-NEXT: s_cbranch_execz BB13_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2864,7 +2390,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB15_2: +; GFX8-NEXT: BB13_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2907,14 +2433,14 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB15_2 +; GFX9-NEXT: s_cbranch_execz BB13_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB15_2: +; GFX9-NEXT: BB13_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2966,7 +2492,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB15_2 +; GFX1064-NEXT: s_cbranch_execz BB13_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2976,7 +2502,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB15_2: +; GFX1064-NEXT: BB13_2: ; 
GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3017,7 +2543,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB15_2 +; GFX1032-NEXT: s_cbranch_execz BB13_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3026,7 +2552,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB15_2: +; GFX1032-NEXT: BB13_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3090,7 +2616,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB16_2 +; GFX8-NEXT: s_cbranch_execz BB14_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -3098,7 +2624,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB16_2: +; GFX8-NEXT: BB14_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3141,14 +2667,14 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB16_2 +; GFX9-NEXT: s_cbranch_execz BB14_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB16_2: +; GFX9-NEXT: BB14_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3200,7 +2726,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB16_2 +; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -3210,7 +2736,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB16_2: +; GFX1064-NEXT: BB14_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3251,7 +2777,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB16_2 +; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3260,7 +2786,7 @@ define amdgpu_kernel 
void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB16_2: +; GFX1032-NEXT: BB14_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3324,7 +2850,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB17_2 +; GFX8-NEXT: s_cbranch_execz BB15_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -3332,7 +2858,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB17_2: +; GFX8-NEXT: BB15_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3375,14 +2901,14 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB17_2 +; GFX9-NEXT: s_cbranch_execz BB15_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB17_2: +; GFX9-NEXT: BB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3436,7 +2962,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB17_2 +; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -3446,7 +2972,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB17_2: +; GFX1064-NEXT: BB15_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3489,7 +3015,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB17_2 +; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3498,7 +3024,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB17_2: +; GFX1032-NEXT: BB15_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3526,7 +3052,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; 
GFX7LESS-NEXT: s_cbranch_execz BB18_2 +; GFX7LESS-NEXT: s_cbranch_execz BB16_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -3535,7 +3061,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB18_2: +; GFX7LESS-NEXT: BB16_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -3561,7 +3087,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB18_2 +; GFX8-NEXT: s_cbranch_execz BB16_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3570,7 +3096,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB18_2: +; GFX8-NEXT: BB16_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3596,7 +3122,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB18_2 +; GFX9-NEXT: s_cbranch_execz BB16_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3604,7 +3130,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB18_2: +; GFX9-NEXT: BB16_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3630,7 +3156,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB18_2 +; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3640,7 +3166,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB18_2: +; GFX1064-NEXT: BB16_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -3663,7 +3189,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB18_2 +; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3673,7 +3199,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB18_2: +; GFX1032-NEXT: BB16_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -3741,7 +3267,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB19_2 +; GFX8-NEXT: s_cbranch_execz BB17_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -3749,7 +3275,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB19_2: +; GFX8-NEXT: BB17_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3792,14 +3318,14 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB19_2 +; GFX9-NEXT: s_cbranch_execz BB17_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB19_2: +; GFX9-NEXT: BB17_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3853,7 +3379,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB19_2 +; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -3863,7 +3389,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB19_2: +; GFX1064-NEXT: BB17_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3906,7 +3432,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB19_2 +; GFX1032-NEXT: s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3915,7 +3441,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB19_2: +; GFX1032-NEXT: BB17_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3943,7 +3469,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB20_2 +; GFX7LESS-NEXT: s_cbranch_execz BB18_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: 
v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -3952,7 +3478,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB20_2: +; GFX7LESS-NEXT: BB18_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -3978,7 +3504,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB20_2 +; GFX8-NEXT: s_cbranch_execz BB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3987,7 +3513,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB20_2: +; GFX8-NEXT: BB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 @@ -4013,7 +3539,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB20_2 +; GFX9-NEXT: s_cbranch_execz BB18_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4021,7 +3547,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB20_2: +; GFX9-NEXT: BB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 @@ -4047,7 +3573,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB20_2 +; GFX1064-NEXT: s_cbranch_execz BB18_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4057,7 +3583,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB20_2: +; GFX1064-NEXT: BB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -4080,7 +3606,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB20_2 +; GFX1032-NEXT: s_cbranch_execz BB18_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4090,7 +3616,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB20_2: +; GFX1032-NEXT: BB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -4158,7 +3684,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB21_2 +; GFX8-NEXT: s_cbranch_execz BB19_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -4166,7 +3692,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB21_2: +; GFX8-NEXT: BB19_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -4209,14 +3735,14 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB21_2 +; GFX9-NEXT: s_cbranch_execz BB19_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB21_2: +; GFX9-NEXT: BB19_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -4268,7 +3794,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB21_2 +; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -4278,7 +3804,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB21_2: +; GFX1064-NEXT: BB19_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -4319,7 +3845,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB21_2 +; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -4328,7 +3854,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB21_2: +; GFX1032-NEXT: BB19_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -4356,7 +3882,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB22_2 +; GFX7LESS-NEXT: s_cbranch_execz BB20_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4365,7 +3891,7 @@ define amdgpu_kernel 
void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB22_2: +; GFX7LESS-NEXT: BB20_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -4390,7 +3916,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB22_2 +; GFX8-NEXT: s_cbranch_execz BB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4399,7 +3925,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB22_2: +; GFX8-NEXT: BB20_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -4424,7 +3950,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB22_2 +; GFX9-NEXT: s_cbranch_execz BB20_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4432,7 +3958,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB22_2: +; GFX9-NEXT: BB20_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -4457,7 +3983,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB22_2 +; GFX1064-NEXT: s_cbranch_execz BB20_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4467,7 +3993,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB22_2: +; GFX1064-NEXT: BB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -4490,7 +4016,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB22_2 +; GFX1032-NEXT: s_cbranch_execz BB20_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4500,7 +4026,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB22_2: +; GFX1032-NEXT: BB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -4568,7 +4094,7 @@ define 
amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB23_2 +; GFX8-NEXT: s_cbranch_execz BB21_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -4576,7 +4102,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB23_2: +; GFX8-NEXT: BB21_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -4619,14 +4145,14 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB23_2 +; GFX9-NEXT: s_cbranch_execz BB21_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB23_2: +; GFX9-NEXT: BB21_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -4678,7 +4204,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB23_2 +; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -4688,7 +4214,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB23_2: +; GFX1064-NEXT: BB21_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -4729,7 +4255,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB23_2 +; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -4738,7 +4264,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB23_2: +; GFX1032-NEXT: BB21_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -4766,7 +4292,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB24_2 +; GFX7LESS-NEXT: s_cbranch_execz BB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4775,7 +4301,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: 
ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB24_2: +; GFX7LESS-NEXT: BB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -4800,7 +4326,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB24_2 +; GFX8-NEXT: s_cbranch_execz BB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4809,7 +4335,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB24_2: +; GFX8-NEXT: BB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -4834,7 +4360,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB24_2 +; GFX9-NEXT: s_cbranch_execz BB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4842,7 +4368,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB24_2: +; GFX9-NEXT: BB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -4867,7 +4393,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB24_2 +; GFX1064-NEXT: s_cbranch_execz BB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4877,7 +4403,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB24_2: +; GFX1064-NEXT: BB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -4900,7 +4426,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB24_2 +; GFX1032-NEXT: s_cbranch_execz BB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4910,7 +4436,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB24_2: +; GFX1032-NEXT: BB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -- GitLab From 5dd5ddcb41509dee9d830beaaee538e83fde8dff Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 18 Mar 2021 17:16:48 +0000 
Subject: [PATCH 0132/1000] [AMDGPU] Skip building some IR if it won't be used. NFC. --- llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 1b98eb04e0d8..34a7cb5a72a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -478,6 +478,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, Value *ExclScan = nullptr; Value *NewV = nullptr; + const bool NeedResult = !I.use_empty(); + // If we have a divergent value in each lane, we need to combine the value // using DPP. if (ValDivergent) { @@ -488,7 +490,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, const AtomicRMWInst::BinOp ScanOp = Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op; NewV = buildScan(B, ScanOp, NewV, Identity); - ExclScan = buildShiftRight(B, NewV, Identity); + if (NeedResult) + ExclScan = buildShiftRight(B, NewV, Identity); // Read the value from the last lane, which has accumlated the values of // each active lane in the wavefront. This will be our new value which we @@ -581,7 +584,6 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // original instruction. B.SetInsertPoint(&I); - const bool NeedResult = !I.use_empty(); if (NeedResult) { // Create a PHI node to get our new atomic result into the exit block. PHINode *const PHI = B.CreatePHI(Ty, 2); -- GitLab From 5a5a531214c707f8d321743f5bfabfd6bbb73496 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Mar 2021 09:27:11 +0000 Subject: [PATCH 0133/1000] [AMDGPU] Remove some redundant code. NFC. This is redundant because we have already checked that we can't handle divergent 64-bit atomic operands. --- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 21 ++----------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index 34a7cb5a72a8..147c88d82cf8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -497,25 +497,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I, // each active lane in the wavefront. This will be our new value which we // will provide to the atomic operation. Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1); - if (TyBitWidth == 64) { - Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty()); - Value *const ExtractHi = - B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty()); - CallInst *const ReadLaneLo = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx}); - CallInst *const ReadLaneHi = B.CreateIntrinsic( - Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx}); - Value *const PartialInsert = B.CreateInsertElement( - UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0)); - Value *const Insert = - B.CreateInsertElement(PartialInsert, ReadLaneHi, B.getInt32(1)); - NewV = B.CreateBitCast(Insert, Ty); - } else if (TyBitWidth == 32) { - NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, - {NewV, LastLaneIdx}); - } else { - llvm_unreachable("Unhandled atomic bit width"); - } + assert(TyBitWidth == 32); + NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {NewV, LastLaneIdx}); // Finally mark the readlanes in the WWM section. 
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV); -- GitLab From 51884c6beff75b5b0d7dad50b67bf535f59bd7ae Mon Sep 17 00:00:00 2001 From: Ricky Taylor Date: Thu, 11 Mar 2021 20:35:04 +0000 Subject: [PATCH 0134/1000] [M68k] Introduce DReg bead This is required in order to determine during disassembly whether a Reg bead without associated DA bead is referring to a data register. Differential Revision: https://reviews.llvm.org/D98534 --- llvm/lib/Target/M68k/M68kInstrArithmetic.td | 34 +++++++++---------- llvm/lib/Target/M68k/M68kInstrBits.td | 6 ++-- llvm/lib/Target/M68k/M68kInstrFormats.td | 17 +++++----- llvm/lib/Target/M68k/M68kInstrShiftRotate.td | 6 ++-- .../Target/M68k/MCTargetDesc/M68kBaseInfo.h | 11 +++--- .../M68k/MCTargetDesc/M68kMCCodeEmitter.cpp | 2 ++ 6 files changed, 40 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td index f4714d2534bd..d6ecec07439d 100644 --- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td +++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td @@ -38,7 +38,7 @@ /// | | | EFFECTIVE ADDRESS /// x x x x | REG | OP MODE | MODE | REG /// ---------------------------------------------------- -class MxArithEncoding : MxEncoding; @@ -53,7 +53,7 @@ class MxArithEncoding + MxBeadDReg SRC, MxBeadDReg DST> : MxEncoding, SIZE, MxBead1Bit<0b1>, DST, CMD>; /// Encoding for Immediate forms @@ -88,13 +88,13 @@ let Defs = [CCR] in { let Constraints = "$src = $dst" in { // $reg, $ccr <- $reg op $reg -class MxBiArOp_RFRR_xEA CMD> +class MxBiArOp_RFRR_xEA CMD, MxBead REG> : MxInst<(outs TYPE.ROp:$dst), (ins TYPE.ROp:$src, TYPE.ROp:$opd), MN#"."#TYPE.Prefix#"\t$opd, $dst", [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd))], MxArithEncoding, !cast("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"), - MxBeadReg<0>, + REG, !cast("MxEncEA"#TYPE.RLet#"_2"), MxExtEmpty>>; @@ -110,7 +110,7 @@ class MxBiArOp_RFRR_EAd CMD> [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd))], MxArithEncoding, !cast("MxOpMode"#TYPE.Size#"EAd"), - MxBeadReg<2>, MxEncEAd_0, MxExtEmpty>>; + MxBeadDReg<2>, MxEncEAd_0, MxExtEmpty>>; // $reg <- $reg op $imm class MxBiArOp_RFRI_xEA CMD> @@ -119,7 +119,7 @@ class MxBiArOp_RFRI_xEA CMD> [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.IPat:$opd))], MxArithEncoding, !cast("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"), - MxBeadReg<0>, MxEncEAi, + MxBeadDReg<0>, MxEncEAi, !cast("MxExtI"#TYPE.Size#"_2")>>; // Again, there are two ways to write an immediate to Dn register either dEA @@ -141,7 +141,7 @@ class MxBiArOp_RFRM, !cast("MxOpMode"#TYPE.Size#TYPE.RLet#"EA"), - MxBeadReg<0>, EA, EXT>>; + MxBeadDReg<0>, EA, EXT>>; } // Constraints @@ -157,7 +157,7 @@ class MxBiArOp_FMR, !cast("MxOpMode"#TYPE.Size#"EA"#TYPE.RLet), - MxBeadReg<1>, EA, EXT>>; + MxBeadDReg<1>, EA, EXT>>; class MxBiArOp_FMI; - def NAME#"16dd" : MxBiArOp_RFRR_xEA; - def NAME#"32dd" : MxBiArOp_RFRR_xEA; + def NAME#"8dd" : MxBiArOp_RFRR_xEA>; + def NAME#"16dd" : MxBiArOp_RFRR_xEA>; + def NAME#"32dd" : MxBiArOp_RFRR_xEA>; } // isComm @@ -291,7 +291,7 @@ multiclass MxBiArOp_AF; let isCommutable = isComm in - def NAME#"32rr" : MxBiArOp_RFRR_xEA; + def NAME#"32rr" : MxBiArOp_RFRR_xEA>; } // MxBiArOp_AF @@ -313,7 +313,7 @@ class MxBiArOp_RFRRF CMD> [(set TYPE.VT:$dst, CCR, (NODE TYPE.VT:$src, TYPE.VT:$opd, CCR))], MxArithXEncoding, !cast("MxEncSize"#TYPE.Size), - MxBead1Bit<0>, MxBeadReg<2>, MxBeadReg<0>>>; + MxBead1Bit<0>, MxBeadDReg<2>, MxBeadDReg<0>>>; } // Constraints } // Uses, Defs @@ -372,7 +372,7 @@ class MxCmp_RR 
[(set CCR, (MxCmp TYPE.VT:$lhs, TYPE.VT:$rhs))], MxArithEncoding, !cast("MxOpMode"#TYPE.Size#"dEA"), - MxBeadReg<1>, MxEncEAd_0, MxExtEmpty>>; + MxBeadDReg<1>, MxEncEAd_0, MxExtEmpty>>; class MxCmp_RI : MxInst<(outs), (ins TYPE.IOp:$imm, TYPE.ROp:$reg), @@ -412,7 +412,7 @@ class MxCmp_RM, !cast("MxOpMode"#TYPE.Size#"dEA"), - MxBeadReg<0>, EA, EXT>>; + MxBeadDReg<0>, EA, EXT>>; } // let mayLoad = 1 } // let Defs = [CCR] @@ -474,7 +474,7 @@ def MxExtOpmode_lb : MxBead3Bits<0b111>; /// 0 1 0 0 1 0 0 | OPMODE | 0 0 0 | REG /// --------------------------------------------------- class MxExtEncoding - : MxEncoding, MxBead3Bits<0b000>, OPMODE, + : MxEncoding, MxBead3Bits<0b000>, OPMODE, MxBead3Bits<0b100>, MxBead4Bits<0b0100>>; let Defs = [CCR] in @@ -508,7 +508,7 @@ def MxUDiMuOpmode : MxBead3Bits<0b011>; /// x x x x | REG | OP MODE | MODE | REG /// ---------------------------------------------------- class MxDiMuEncoding - : MxEncoding, CMD, + : MxEncoding, CMD, EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; let Defs = [CCR] in { diff --git a/llvm/lib/Target/M68k/M68kInstrBits.td b/llvm/lib/Target/M68k/M68kInstrBits.td index 96d536520939..d97ca50f74a9 100644 --- a/llvm/lib/Target/M68k/M68kInstrBits.td +++ b/llvm/lib/Target/M68k/M68kInstrBits.td @@ -32,7 +32,7 @@ /// ------------+---------+---------+---------+--------- /// 0 0 0 0 | REG | 1 0 0 | MODE | REG /// ------------+---------+---------+---------+--------- -class MxBTSTEnc_R +class MxBTSTEnc_R : MxEncoding, REG, MxBead4Bits<0b0000>, EXT.Imm, EXT.B8, EXT.Scale, EXT.WL, EXT.DAReg>; @@ -52,7 +52,7 @@ let Defs = [CCR] in { class MxBTST_RR : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst", [(set CCR, (MxBt TYPE.VT:$dst, TYPE.VT:$bitno))], - MxBTSTEnc_R, MxEncEAd_0, MxExtEmpty>>; + MxBTSTEnc_R, MxEncEAd_0, MxExtEmpty>>; class MxBTST_RI : MxInst<(outs), (ins TYPE.ROp:$dst, TYPE.IOp:$bitno), "btst\t$bitno, $dst", @@ -63,7 +63,7 @@ class MxBTST_MR : MxInst<(outs), (ins MEMOpd:$dst, TYPE.ROp:$bitno), "btst\t$bitno, $dst", [(set CCR, (MxBt (TYPE.Load MEMPat:$dst), TYPE.VT:$bitno))], - MxBTSTEnc_R, EA, EXT>>; + MxBTSTEnc_R, EA, EXT>>; class MxBTST_MI diff --git a/llvm/lib/Target/M68k/M68kInstrFormats.td b/llvm/lib/Target/M68k/M68kInstrFormats.td index b147537eb32b..1d950bd0377a 100644 --- a/llvm/lib/Target/M68k/M68kInstrFormats.td +++ b/llvm/lib/Target/M68k/M68kInstrFormats.td @@ -95,16 +95,17 @@ class MxBead4Bits b> : MxBead<0x4, b{0}, b{1}, b{2}, b{3}>; class MxBeadDAReg o, bit a = 0> : MxBead<0x5, o{0}, o{1}, o{2}, a>; class MxBeadDA o, bit a = 0> : MxBead<0x6, o{0}, o{1}, o{2}, a>; class MxBeadReg o, bit a = 0> : MxBead<0x7, o{0}, o{1}, o{2}, a>; -class MxBead8Disp o, bit a = 0> : MxBead<0x8, o{0}, o{1}, o{2}, a>; +class MxBeadDReg o, bit a = 0> : MxBead<0x8, o{0}, o{1}, o{2}, a>; +class MxBead8Disp o, bit a = 0> : MxBead<0x9, o{0}, o{1}, o{2}, a>; /// Add Immediate to the instruction. 8-bit version is padded with zeros to fit /// the word. -class MxBead8Imm o, bit a = 0> : MxBead<0x9, o{0}, o{1}, o{2}, a>; -class MxBead16Imm o, bit a = 0> : MxBead<0xA, o{0}, o{1}, o{2}, a>; -class MxBead32Imm o, bit a = 0> : MxBead<0xB, o{0}, o{1}, o{2}, a>; +class MxBead8Imm o, bit a = 0> : MxBead<0xA, o{0}, o{1}, o{2}, a>; +class MxBead16Imm o, bit a = 0> : MxBead<0xB, o{0}, o{1}, o{2}, a>; +class MxBead32Imm o, bit a = 0> : MxBead<0xC, o{0}, o{1}, o{2}, a>; /// Encodes an immediate 0-7(alt. 
1-8) into 3 bit field -class MxBead3Imm o, bit a = 0> : MxBead<0xC, o{0}, o{1}, o{2}, a>; +class MxBead3Imm o, bit a = 0> : MxBead<0xD, o{0}, o{1}, o{2}, a>; class MxEncoding { // FIXME: Is there a way to factorize the addressing mode suffix (i.e. // 'r', 'd', 'a' etc.) and use something like multiclass to replace? def MxEncEAr_0: MxEncEA, MxBead2Bits<0b00>>; -def MxEncEAd_0: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<0>>; +def MxEncEAd_0: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<0>>; def MxEncEAa_0: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<1>>; def MxEncEAj_0: MxEncEA, MxBead2Bits<0b01>, MxBead1Bit<0>>; def MxEncEAo_0: MxEncEA, MxBead2Bits<0b01>, MxBead1Bit<1>>; @@ -214,7 +215,7 @@ def MxEncEAa_0_reflected : MxEncEA, MxBead3Bits<0b001>>; def MxEncEAr_0_reflected : MxEncEA, MxBead2Bits<0b00>, MxBeadDA<0>>; def MxEncEAr_1: MxEncEA, MxBead2Bits<0b00>>; -def MxEncEAd_1: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<0>>; +def MxEncEAd_1: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<0>>; def MxEncEAa_1: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<1>>; def MxEncEAj_1: MxEncEA, MxBead2Bits<0b01>, MxBead1Bit<0>>; def MxEncEAo_1: MxEncEA, MxBead2Bits<0b01>, MxBead1Bit<1>>; @@ -223,7 +224,7 @@ def MxEncEAp_1: MxEncEA, MxBead2Bits<0b10>, MxBead1Bit<1>>; def MxEncEAf_1: MxEncEA, MxBead2Bits<0b11>, MxBead1Bit<0>>; def MxEncEAr_2: MxEncEA, MxBead2Bits<0b00>>; -def MxEncEAd_2: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<0>>; +def MxEncEAd_2: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<0>>; def MxEncEAa_2: MxEncEA, MxBead2Bits<0b00>, MxBead1Bit<1>>; def MxEncEAj_2: MxEncEA, MxBead2Bits<0b01>, MxBead1Bit<0>>; def MxEncEAo_2: MxEncEA, MxBead2Bits<0b01>, MxBead1Bit<1>>; diff --git a/llvm/lib/Target/M68k/M68kInstrShiftRotate.td b/llvm/lib/Target/M68k/M68kInstrShiftRotate.td index f777a5d33e21..cab687638076 100644 --- a/llvm/lib/Target/M68k/M68kInstrShiftRotate.td +++ b/llvm/lib/Target/M68k/M68kInstrShiftRotate.td @@ -38,11 +38,11 @@ def MxROOP_RO : MxBead2Bits<0b11>; /// 1 1 1 0 | REG/IMM | D | SIZE |R/I| OP | REG /// ------------+---------+---+------+---+------+--------- class MxSREncoding_R - : MxEncoding, ROOP, MxBead1Bit<1>, SIZE, DIRECTION, - MxBeadReg<2>, MxBead4Bits<0b1110>>; + : MxEncoding, ROOP, MxBead1Bit<1>, SIZE, DIRECTION, + MxBeadDReg<2>, MxBead4Bits<0b1110>>; class MxSREncoding_I - : MxEncoding, ROOP, MxBead1Bit<0>, SIZE, DIRECTION, + : MxEncoding, ROOP, MxBead1Bit<0>, SIZE, DIRECTION, MxBead3Imm<2, 1>, MxBead4Bits<0b1110>>; // $reg <- $reg op $reg diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h index 36592fda1a96..eac4ded71aab 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h @@ -58,11 +58,12 @@ enum { DAReg = 0x5, DA = 0x6, Reg = 0x7, - Disp8 = 0x8, - Imm8 = 0x9, - Imm16 = 0xA, - Imm32 = 0xB, - Imm3 = 0xC, + DReg = 0x8, + Disp8 = 0x9, + Imm8 = 0xA, + Imm16 = 0xB, + Imm32 = 0xC, + Imm3 = 0xD, }; // Ctrl payload diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp index b8579227be1b..9708abaadf98 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp @@ -121,6 +121,7 @@ unsigned M68kMCCodeEmitter::encodeReg(unsigned ThisByte, uint8_t Bead, Reg = false; DA = true; break; + case M68kBeads::DReg: case M68kBeads::Reg: Reg = true; DA = false; @@ -351,6 +352,7 @@ void M68kMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, break; case 
M68kBeads::DAReg: case M68kBeads::DA: + case M68kBeads::DReg: case M68kBeads::Reg: Offset += encodeReg(ThisByte, Bead, MI, Desc, Buffer, Offset, Fixups, STI); -- GitLab From cd442157cff4aad209ae532cbf031abbe10bc1df Mon Sep 17 00:00:00 2001 From: Ricky Taylor Date: Thu, 11 Mar 2021 20:37:00 +0000 Subject: [PATCH 0135/1000] [M68k] Convert register Aliases to AltNames This makes it simpler to determine when two registers are actually the same vs just partially aliasing. The only real caveat is that it becomes impossible to know which name was used for the register previously. (i.e. parsing assembly and then disassembling it can result in the register name changing.) Differential Revision: https://reviews.llvm.org/D98536 --- llvm/lib/Target/M68k/M68kRegisterInfo.td | 83 +++++++++---------- .../Target/M68k/MCTargetDesc/M68kBaseInfo.h | 5 +- 2 files changed, 42 insertions(+), 46 deletions(-) diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.td b/llvm/lib/Target/M68k/M68kRegisterInfo.td index 76e762c718b0..e2ea2967f75b 100644 --- a/llvm/lib/Target/M68k/M68kRegisterInfo.td +++ b/llvm/lib/Target/M68k/M68kRegisterInfo.td @@ -15,8 +15,8 @@ class MxReg ENC, list SUBREGS = [], list SUBIDX, - list DWREGS = []> - : Register, DwarfRegNum { + list DWREGS = [], list ALTNAMES = []> + : Register, DwarfRegNum { let Namespace = "M68k"; let HWEncoding = ENC; let SubRegs = SUBREGS; @@ -29,46 +29,45 @@ let Namespace = "M68k" in { def MxSubRegIndex16Lo : SubRegIndex<16, 0>; } -// Generate Data registers and theirs smaller variants -foreach Index = 0-7 in { - def "BD"#Index : MxReg<"d"#Index, Index, [], [], [Index]>; - - def "WD"#Index - : MxReg<"d"#Index, Index, - [!cast("BD"#Index)], [MxSubRegIndex8Lo], - [Index]>; - - def "D"#Index - : MxReg<"d"#Index, Index, - [!cast("WD"#Index)], [MxSubRegIndex16Lo], - [Index]>; - -} // foreach - -// Generate Address registers and theirs smaller variants -foreach Index = 0-7 in { - def "WA"#Index - : MxReg<"a"#Index, Index, [], [], [!add(8,Index)]>; - - def "A"#Index - : MxReg<"a"#Index, Index, - [!cast("WA"#Index)], [MxSubRegIndex16Lo], - [!add(8,Index)]>; +multiclass MxDataRegister ALTNAMES = []> { + def "B"#NAME : MxReg; + def "W"#NAME + : MxReg("B"#NAME)], [MxSubRegIndex8Lo], + [INDEX], ALTNAMES>; + def NAME + : MxReg("W"#NAME)], [MxSubRegIndex16Lo], + [INDEX], ALTNAMES>; } -// Alias Registers -class MxAliasReg - : MxReg { - let Aliases = [REG]; +multiclass MxAddressRegister ALTNAMES = []> { + def "W"#NAME + : MxReg; + def NAME + : MxReg("W"#NAME)], [MxSubRegIndex16Lo], + [!add(8,INDEX)], ALTNAMES>; } -def BP : MxAliasReg<"bp", A5>; -def FP : MxAliasReg<"fp", A6>; -def SP : MxAliasReg<"sp", A7>; +defm D0 : MxDataRegister<0, "d0">; +defm D1 : MxDataRegister<1, "d1">; +defm D2 : MxDataRegister<2, "d2">; +defm D3 : MxDataRegister<3, "d3">; +defm D4 : MxDataRegister<4, "d4">; +defm D5 : MxDataRegister<5, "d5">; +defm D6 : MxDataRegister<6, "d6">; +defm D7 : MxDataRegister<7, "d7">; + +defm A0 : MxAddressRegister<0, "a0">; +defm A1 : MxAddressRegister<1, "a1">; +defm A2 : MxAddressRegister<2, "a2">; +defm A3 : MxAddressRegister<3, "a3">; +defm A4 : MxAddressRegister<4, "a4">; +defm A5 : MxAddressRegister<5, "a5", ["bp"]>; +defm A6 : MxAddressRegister<6, "a6", ["fp"]>; +defm SP : MxAddressRegister<7, "sp", ["usp", "ssp", "isp", "a7"]>; -def USP : MxAliasReg<"usp", A7>; -def SSP : MxAliasReg<"ssp", A7>; -def ISP : MxAliasReg<"isp", A7>; // Pseudo Registers class MxPseudoReg SUBREGS = [], list SUBIDX = []> @@ -92,10 +91,10 @@ def DR16 : MxRegClass<[i16], 16, (sequence "WD%u", 0, 
7)>; def DR32 : MxRegClass<[i32], 32, (sequence "D%u", 0, 7)>; // Address Registers -def AR16 : MxRegClass<[i16], 16, (sequence "WA%u", 0, 6)>; +def AR16 : MxRegClass<[i16], 16, (add (sequence "WA%u", 0, 6), WSP)>; def AR32 : MxRegClass<[i32], 32, (add (sequence "A%u", 0, 6), SP)>; -def AR32_NOSP : MxRegClass<[i32], 32, (add (sequence "A%u", 0, 6))>; +def AR32_NOSP : MxRegClass<[i32], 32, (sequence "A%u", 0, 6)>; // Index Register Classes // FIXME try alternative ordering like `D0, D1, A0, A1, ...` @@ -124,7 +123,5 @@ def XR16_TC : MxRegClass<[i16], 16, (add DR16_TC, AR16_TC)>; def XR32_TC : MxRegClass<[i32], 32, (add DR32_TC, AR32_TC)>; // These classes provide spill/restore order if used with MOVEM instruction -def SPILL : MxRegClass<[i32], 32, (add (add (sequence "D%u", 0, 7), - (sequence "A%u", 0, 6)), SP)>; -def SPILL_R : MxRegClass<[i32], 32, (add SP, (add (sequence "A%u", 6, 0), - (sequence "D%u", 7, 0)))>; +def SPILL : MxRegClass<[i32], 32, (add XR32)>; +def SPILL_R : MxRegClass<[i32], 32, (add SP, (sequence "A%u", 6, 0), (sequence "D%u", 7, 0))>; diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h b/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h index eac4ded71aab..7c56cfdf3123 100644 --- a/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h +++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kBaseInfo.h @@ -182,7 +182,7 @@ static inline bool isAddressRegister(unsigned RegNo) { case M68k::WA4: case M68k::WA5: case M68k::WA6: - case M68k::WA7: + case M68k::WSP: case M68k::A0: case M68k::A1: case M68k::A2: @@ -190,7 +190,6 @@ static inline bool isAddressRegister(unsigned RegNo) { case M68k::A4: case M68k::A5: case M68k::A6: - case M68k::A7: case M68k::SP: return true; default: @@ -237,7 +236,7 @@ static inline unsigned getMaskedSpillRegister(unsigned order) { case 14: return M68k::A6; case 15: - return M68k::A7; + return M68k::SP; } } -- GitLab From c2313a45307e807a6ee08d3b32cf6e4d099849a6 Mon Sep 17 00:00:00 2001 From: Simonas Kazlauskas Date: Fri, 19 Mar 2021 02:18:34 +0200 Subject: [PATCH 0136/1000] [X86, NFC] Update stack-clash tests using the automated tooling This is in preparation of changes in this area (such as D98789 and D98906). 
Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D98909 --- .../CodeGen/X86/stack-clash-dynamic-alloca.ll | 106 ++++++----- .../X86/stack-clash-large-large-align.ll | 110 ++++++------ llvm/test/CodeGen/X86/stack-clash-large.ll | 81 +++++---- ...-medium-natural-probes-mutliple-objects.ll | 8 +- .../X86/stack-clash-medium-natural-probes.ll | 28 ++- llvm/test/CodeGen/X86/stack-clash-medium.ll | 53 +++--- .../stack-clash-small-alloc-medium-align.ll | 169 +++++++++--------- .../X86/stack-clash-small-large-align.ll | 108 ++++++----- llvm/test/CodeGen/X86/stack-clash-small.ll | 18 +- .../CodeGen/X86/stack-clash-unknown-call.ll | 33 ++-- 10 files changed, 341 insertions(+), 373 deletions(-) diff --git a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll index 6dd8b6ab5897..3af4bf72b4d8 100644 --- a/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll +++ b/llvm/test/CodeGen/X86/stack-clash-dynamic-alloca.ll @@ -1,71 +1,69 @@ -; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s -; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp +; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s +; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s define i32 @foo(i32 %n) local_unnamed_addr #0 { - %a = alloca i32, i32 %n, align 16 - %b = getelementptr inbounds i32, i32* %a, i64 1198 - store volatile i32 1, i32* %b - %c = load volatile i32, i32* %a - ret i32 %c -} - -attributes #0 = {"probe-stack"="inline-asm"} - ; CHECK-X86-64-LABEL: foo: ; CHECK-X86-64: # %bb.0: -; CHECK-X86-64-NEXT: pushq %rbp -; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 16 -; CHECK-X86-64-NEXT: .cfi_offset %rbp, -16 -; CHECK-X86-64-NEXT: movq %rsp, %rbp -; CHECK-X86-64-NEXT: .cfi_def_cfa_register %rbp -; CHECK-X86-64-NEXT: movq %rsp, %rax -; CHECK-X86-64-NEXT: movl %edi, %ecx -; CHECK-X86-64-NEXT: leaq 15(,%rcx,4), %rcx -; CHECK-X86-64-NEXT: andq $-16, %rcx -; CHECK-X86-64-NEXT: subq %rcx, %rax -; CHECK-X86-64-NEXT: cmpq %rsp, %rax -; CHECK-X86-64-NEXT: jge .LBB0_3 +; CHECK-X86-64-NEXT: pushq %rbp +; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-X86-64-NEXT: .cfi_offset %rbp, -16 +; CHECK-X86-64-NEXT: movq %rsp, %rbp +; CHECK-X86-64-NEXT: .cfi_def_cfa_register %rbp +; CHECK-X86-64-NEXT: movq %rsp, %rax +; CHECK-X86-64-NEXT: movl %edi, %ecx +; CHECK-X86-64-NEXT: leaq 15(,%rcx,4), %rcx +; CHECK-X86-64-NEXT: andq $-16, %rcx +; CHECK-X86-64-NEXT: subq %rcx, %rax +; CHECK-X86-64-NEXT: cmpq %rsp, %rax +; CHECK-X86-64-NEXT: jge .LBB0_3 ; CHECK-X86-64-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-64-NEXT: xorq $0, (%rsp) -; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-X86-64-NEXT: cmpq %rsp, %rax -; CHECK-X86-64-NEXT: jl .LBB0_2 +; CHECK-X86-64-NEXT: xorq $0, (%rsp) +; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-X86-64-NEXT: cmpq %rsp, %rax +; CHECK-X86-64-NEXT: jl .LBB0_2 ; CHECK-X86-64-NEXT: .LBB0_3: -; CHECK-X86-64-NEXT: movq %rax, %rsp -; CHECK-X86-64-NEXT: movl $1, 4792(%rax) -; CHECK-X86-64-NEXT: movl (%rax), %eax -; CHECK-X86-64-NEXT: movq %rbp, %rsp -; CHECK-X86-64-NEXT: popq %rbp -; CHECK-X86-64-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-X86-64-NEXT: retq - - +; CHECK-X86-64-NEXT: movq %rax, %rsp +; CHECK-X86-64-NEXT: movl $1, 4792(%rax) +; CHECK-X86-64-NEXT: movl (%rax), %eax +; 
CHECK-X86-64-NEXT: movq %rbp, %rsp +; CHECK-X86-64-NEXT: popq %rbp +; CHECK-X86-64-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-X86-64-NEXT: retq +; ; CHECK-X86-32-LABEL: foo: ; CHECK-X86-32: # %bb.0: -; CHECK-X86-32-NEXT: pushl %ebp +; CHECK-X86-32-NEXT: pushl %ebp ; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 8 ; CHECK-X86-32-NEXT: .cfi_offset %ebp, -8 -; CHECK-X86-32-NEXT: movl %esp, %ebp +; CHECK-X86-32-NEXT: movl %esp, %ebp ; CHECK-X86-32-NEXT: .cfi_def_cfa_register %ebp -; CHECK-X86-32-NEXT: subl $8, %esp -; CHECK-X86-32-NEXT: movl 8(%ebp), %ecx -; CHECK-X86-32-NEXT: movl %esp, %eax -; CHECK-X86-32-NEXT: leal 15(,%ecx,4), %ecx -; CHECK-X86-32-NEXT: andl $-16, %ecx -; CHECK-X86-32-NEXT: subl %ecx, %eax -; CHECK-X86-32-NEXT: cmpl %esp, %eax -; CHECK-X86-32-NEXT: jge .LBB0_3 +; CHECK-X86-32-NEXT: subl $8, %esp +; CHECK-X86-32-NEXT: movl 8(%ebp), %ecx +; CHECK-X86-32-NEXT: movl %esp, %eax +; CHECK-X86-32-NEXT: leal 15(,%ecx,4), %ecx +; CHECK-X86-32-NEXT: andl $-16, %ecx +; CHECK-X86-32-NEXT: subl %ecx, %eax +; CHECK-X86-32-NEXT: cmpl %esp, %eax +; CHECK-X86-32-NEXT: jge .LBB0_3 ; CHECK-X86-32-NEXT: .LBB0_2: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-32-NEXT: xorl $0, (%esp) -; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X86-32-NEXT: cmpl %esp, %eax +; CHECK-X86-32-NEXT: xorl $0, (%esp) +; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 +; CHECK-X86-32-NEXT: cmpl %esp, %eax ; CHECK-X86-32-NEXT: jl .LBB0_2 ; CHECK-X86-32-NEXT: .LBB0_3: -; CHECK-X86-32-NEXT: movl %eax, %esp -; CHECK-X86-32-NEXT: movl $1, 4792(%eax) -; CHECK-X86-32-NEXT: movl (%eax), %eax -; CHECK-X86-32-NEXT: movl %ebp, %esp -; CHECK-X86-32-NEXT: popl %ebp +; CHECK-X86-32-NEXT: movl %eax, %esp +; CHECK-X86-32-NEXT: movl $1, 4792(%eax) +; CHECK-X86-32-NEXT: movl (%eax), %eax +; CHECK-X86-32-NEXT: movl %ebp, %esp +; CHECK-X86-32-NEXT: popl %ebp ; CHECK-X86-32-NEXT: .cfi_def_cfa %esp, 4 ; CHECK-X86-32-NEXT: retl + %a = alloca i32, i32 %n, align 16 + %b = getelementptr inbounds i32, i32* %a, i64 1198 + store volatile i32 1, i32* %b + %c = load volatile i32, i32* %a + ret i32 %c +} +attributes #0 = {"probe-stack"="inline-asm"} diff --git a/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll index 6c981cb4ac91..5710252f6c7d 100644 --- a/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll +++ b/llvm/test/CodeGen/X86/stack-clash-large-large-align.ll @@ -1,28 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp ; RUN: llc < %s | FileCheck %s - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define i32 @foo_noprotect() local_unnamed_addr { ; CHECK-LABEL: foo_noprotect: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: andq $-4096, %rsp # imm = 0xF000 -; CHECK-NEXT: subq $73728, %rsp # imm = 0x12000 -; CHECK-NEXT: movl $1, 392(%rsp) -; CHECK-NEXT: movl $1, 28792(%rsp) -; CHECK-NEXT: movl (%rsp), %eax -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq - - +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-4096, %rsp # imm = 0xF000 +; CHECK-NEXT: subq $73728, %rsp # imm = 0x12000 +; 
CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl $1, 28792(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %a = alloca i32, i64 18000, align 4096 %b0 = getelementptr inbounds i32, i32* %a, i64 98 %b1 = getelementptr inbounds i32, i32* %a, i64 7198 @@ -35,47 +33,43 @@ define i32 @foo_noprotect() local_unnamed_addr { define i32 @foo_protect() local_unnamed_addr #0 { ; CHECK-LABEL: foo_protect: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: movq %rsp, %r11 -; CHECK-NEXT: andq $-4096, %r11 # imm = 0xF000 -; CHECK-NEXT: cmpq %rsp, %r11 -; CHECK-NEXT: je .LBB1_4 -; CHECK-NEXT:# %bb.1: -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: cmpq %rsp, %r11 -; CHECK-NEXT: jb .LBB1_3 -; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: cmpq %rsp, %r11 -; CHECK-NEXT: jb .LBB1_2 -; CHECK-NEXT:.LBB1_3: -; CHECK-NEXT: movq %r11, %rsp -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT:.LBB1_4: -; CHECK-NEXT: movq %rsp, %r11 -; CHECK-NEXT: subq $73728, %r11 # imm = 0x12000 -; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: cmpq %r11, %rsp -; CHECK-NEXT: jne .LBB1_5 -; CHECK-NEXT:# %bb.6: -; CHECK-NEXT: movl $1, 392(%rsp) -; CHECK-NEXT: movl $1, 28792(%rsp) -; CHECK-NEXT: movl (%rsp), %eax -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq - - - - +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: andq $-4096, %r11 # imm = 0xF000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: je .LBB1_4 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_3 +; CHECK-NEXT: .LBB1_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_2 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: movq %r11, %rsp +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: subq $73728, %r11 # imm = 0x12000 +; CHECK-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq %r11, %rsp +; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT: # %bb.6: +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl $1, 28792(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %a = alloca i32, i64 18000, align 4096 %b0 = getelementptr inbounds i32, i32* %a, i64 98 %b1 = getelementptr inbounds i32, i32* %a, i64 7198 diff --git a/llvm/test/CodeGen/X86/stack-clash-large.ll b/llvm/test/CodeGen/X86/stack-clash-large.ll index dd53cd8f6964..9129e4ed40fd 100644 --- a/llvm/test/CodeGen/X86/stack-clash-large.ll +++ b/llvm/test/CodeGen/X86/stack-clash-large.ll @@ -1,8 +1,45 @@ -; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s -; RUN: llc 
-mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp +; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s +; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s define i32 @foo() local_unnamed_addr #0 { - +; CHECK-X86-64-LABEL: foo: +; CHECK-X86-64: # %bb.0: +; CHECK-X86-64-NEXT: movq %rsp, %r11 +; CHECK-X86-64-NEXT: subq $69632, %r11 # imm = 0x11000 +; CHECK-X86-64-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-X86-64-NEXT: movq $0, (%rsp) +; CHECK-X86-64-NEXT: cmpq %r11, %rsp +; CHECK-X86-64-NEXT: jne .LBB0_1 +; CHECK-X86-64-NEXT: # %bb.2: +; CHECK-X86-64-NEXT: subq $2248, %rsp # imm = 0x8C8 +; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 71888 +; CHECK-X86-64-NEXT: movl $1, 264(%rsp) +; CHECK-X86-64-NEXT: movl $1, 28664(%rsp) +; CHECK-X86-64-NEXT: movl -128(%rsp), %eax +; CHECK-X86-64-NEXT: addq $71880, %rsp # imm = 0x118C8 +; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 8 +; CHECK-X86-64-NEXT: retq +; +; CHECK-X86-32-LABEL: foo: +; CHECK-X86-32: # %bb.0: +; CHECK-X86-32-NEXT: movl %esp, %r11d +; CHECK-X86-32-NEXT: subl $69632, %r11d # imm = 0x11000 +; CHECK-X86-32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 +; CHECK-X86-32-NEXT: movl $0, (%esp) +; CHECK-X86-32-NEXT: cmpl %r11d, %esp +; CHECK-X86-32-NEXT: jne .LBB0_1 +; CHECK-X86-32-NEXT: # %bb.2: +; CHECK-X86-32-NEXT: subl $2380, %esp # imm = 0x94C +; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 72016 +; CHECK-X86-32-NEXT: movl $1, 392(%esp) +; CHECK-X86-32-NEXT: movl $1, 28792(%esp) +; CHECK-X86-32-NEXT: movl (%esp), %eax +; CHECK-X86-32-NEXT: addl $72012, %esp # imm = 0x1194C +; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 4 +; CHECK-X86-32-NEXT: retl %a = alloca i32, i64 18000, align 16 %b0 = getelementptr inbounds i32, i32* %a, i64 98 %b1 = getelementptr inbounds i32, i32* %a, i64 7198 @@ -13,41 +50,3 @@ define i32 @foo() local_unnamed_addr #0 { } attributes #0 = {"probe-stack"="inline-asm"} - -; CHECK-X86-64-LABEL: foo: -; CHECK-X86-64: # %bb.0: -; CHECK-X86-64-NEXT: movq %rsp, %r11 -; CHECK-X86-64-NEXT: subq $69632, %r11 # imm = 0x11000 -; CHECK-X86-64-NEXT: .LBB0_1: -; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-X86-64-NEXT: movq $0, (%rsp) -; CHECK-X86-64-NEXT: cmpq %r11, %rsp -; CHECK-X86-64-NEXT: jne .LBB0_1 -; CHECK-X86-64-NEXT:# %bb.2: -; CHECK-X86-64-NEXT: subq $2248, %rsp -; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 71888 -; CHECK-X86-64-NEXT: movl $1, 264(%rsp) -; CHECK-X86-64-NEXT: movl $1, 28664(%rsp) -; CHECK-X86-64-NEXT: movl -128(%rsp), %eax -; CHECK-X86-64-NEXT: addq $71880, %rsp # imm = 0x118C8 -; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-64-NEXT: retq - -; CHECK-X86-32-LABEL: foo: -; CHECK-X86-32: # %bb.0: -; CHECK-X86-32-NEXT: movl %esp, %r11d -; CHECK-X86-32-NEXT: subl $69632, %r11d # imm = 0x11000 -; CHECK-X86-32-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 -; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X86-32-NEXT: movl $0, (%esp) -; CHECK-X86-32-NEXT: cmpl %r11d, %esp -; CHECK-X86-32-NEXT: jne .LBB0_1 -; CHECK-X86-32-NEXT:# %bb.2: -; CHECK-X86-32-NEXT: subl $2380, %esp -; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 72016 -; CHECK-X86-32-NEXT: movl $1, 392(%esp) -; CHECK-X86-32-NEXT: movl $1, 28792(%esp) -; CHECK-X86-32-NEXT: movl (%esp), %eax -; CHECK-X86-32-NEXT: addl 
$72012, %esp # imm = 0x1194C -; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 4 -; CHECK-X86-32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll index 0fe492a93d0e..ecb30dfeb36e 100644 --- a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll +++ b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes-mutliple-objects.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp ; RUN: llc < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -11,9 +11,9 @@ define i32 @foo() local_unnamed_addr #0 { ; CHECK-NEXT: movq $0, (%rsp) ; CHECK-NEXT: subq $1784, %rsp # imm = 0x6F8 ; CHECK-NEXT: .cfi_def_cfa_offset 5888 -; CHECK-NEXT: movl $1, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl $2, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-NEXT: movl $1, 3872(%rsp) +; CHECK-NEXT: movl $2, 672(%rsp) +; CHECK-NEXT: movl 1872(%rsp), %eax ; CHECK-NEXT: addq $5880, %rsp # imm = 0x16F8 ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll index bb2be8846ec2..b682cf8ac965 100644 --- a/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll +++ b/llvm/test/CodeGen/X86/stack-clash-medium-natural-probes.ll @@ -1,26 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp ; RUN: llc < %s | FileCheck %s - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define i32 @foo() local_unnamed_addr #0 { - ; CHECK-LABEL: foo: -; CHECK: # %bb.0: -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 -; CHECK-NEXT: .cfi_def_cfa_offset 7888 -; CHECK-NEXT: movl $1, 264(%rsp) -; CHECK-NEXT: movl $1, 4664(%rsp) -; CHECK-NEXT: movl -128(%rsp), %eax -; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq - - - +; CHECK: # %bb.0: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $3784, %rsp # imm = 0xEC8 +; CHECK-NEXT: .cfi_def_cfa_offset 7888 +; CHECK-NEXT: movl $1, 264(%rsp) +; CHECK-NEXT: movl $1, 4664(%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $7880, %rsp # imm = 0x1EC8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %a = alloca i32, i64 2000, align 16 %b0 = getelementptr inbounds i32, i32* %a, i64 98 %b1 = getelementptr inbounds i32, i32* %a, i64 1198 diff --git a/llvm/test/CodeGen/X86/stack-clash-medium.ll b/llvm/test/CodeGen/X86/stack-clash-medium.ll index 5a97074025f1..c40396fcead9 100644 --- a/llvm/test/CodeGen/X86/stack-clash-medium.ll +++ b/llvm/test/CodeGen/X86/stack-clash-medium.ll @@ -1,7 +1,31 @@ -; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s -; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp +; RUN: llc -mtriple=x86_64-linux-android < %s | FileCheck -check-prefix=CHECK-X86-64 %s +; RUN: llc -mtriple=i686-linux-android < %s | FileCheck -check-prefix=CHECK-X86-32 %s define 
i32 @foo() local_unnamed_addr #0 { +; CHECK-X86-64-LABEL: foo: +; CHECK-X86-64: # %bb.0: +; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-X86-64-NEXT: movq $0, (%rsp) +; CHECK-X86-64-NEXT: subq $3784, %rsp # imm = 0xEC8 +; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 7888 +; CHECK-X86-64-NEXT: movl $1, 672(%rsp) +; CHECK-X86-64-NEXT: movl -128(%rsp), %eax +; CHECK-X86-64-NEXT: addq $7880, %rsp # imm = 0x1EC8 +; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 8 +; CHECK-X86-64-NEXT: retq +; +; CHECK-X86-32-LABEL: foo: +; CHECK-X86-32: # %bb.0: +; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 +; CHECK-X86-32-NEXT: movl $0, (%esp) +; CHECK-X86-32-NEXT: subl $3916, %esp # imm = 0xF4C +; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 8016 +; CHECK-X86-32-NEXT: movl $1, 800(%esp) +; CHECK-X86-32-NEXT: movl (%esp), %eax +; CHECK-X86-32-NEXT: addl $8012, %esp # imm = 0x1F4C +; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 4 +; CHECK-X86-32-NEXT: retl %a = alloca i32, i64 2000, align 16 %b = getelementptr inbounds i32, i32* %a, i64 200 store volatile i32 1, i32* %b @@ -10,28 +34,3 @@ define i32 @foo() local_unnamed_addr #0 { } attributes #0 = {"probe-stack"="inline-asm"} - -; CHECK-X86-64-LABEL: foo: -; CHECK-X86-64: # %bb.0: -; CHECK-X86-64-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-X86-64-NEXT: movq $0, (%rsp) -; CHECK-X86-64-NEXT: subq $3784, %rsp # imm = 0xEC8 -; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 7888 -; CHECK-X86-64-NEXT: movl $1, 672(%rsp) -; CHECK-X86-64-NEXT: movl -128(%rsp), %eax -; CHECK-X86-64-NEXT: addq $7880, %rsp # imm = 0x1EC8 -; CHECK-X86-64-NEXT: .cfi_def_cfa_offset 8 -; CHECK-X86-64-NEXT: retq - - -; CHECK-X86-32-LABEL: foo: -; CHECK-X86-32: # %bb.0: -; CHECK-X86-32-NEXT: subl $4096, %esp # imm = 0x1000 -; CHECK-X86-32-NEXT: movl $0, (%esp) -; CHECK-X86-32-NEXT: subl $3916, %esp # imm = 0xF4C -; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 8016 -; CHECK-X86-32-NEXT: movl $1, 800(%esp) -; CHECK-X86-32-NEXT: movl (%esp), %eax -; CHECK-X86-32-NEXT: addl $8012, %esp # imm = 0x1F4C -; CHECK-X86-32-NEXT: .cfi_def_cfa_offset 4 -; CHECK-X86-32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll index 39b6c3640a60..221a2e36947e 100644 --- a/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll +++ b/llvm/test/CodeGen/X86/stack-clash-small-alloc-medium-align.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp ; RUN: llc < %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -6,21 +7,20 @@ target triple = "x86_64-unknown-linux-gnu" ; | case1 | alloca + align < probe_size define i32 @foo1(i64 %i) local_unnamed_addr #0 { ; CHECK-LABEL: foo1: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: andq $-64, %rsp -; CHECK-NEXT: subq $832, %rsp # imm = 0x340 -; CHECK-NEXT: movl $1, (%rsp,%rdi,4) -; CHECK-NEXT: movl (%rsp), %eax -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq - +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $832, %rsp # imm = 0x340 +; CHECK-NEXT: movl $1, (%rsp,%rdi,4) +; 
CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %a = alloca i32, i32 200, align 64 %b = getelementptr inbounds i32, i32* %a, i64 %i store volatile i32 1, i32* %b @@ -31,25 +31,24 @@ define i32 @foo1(i64 %i) local_unnamed_addr #0 { ; | case2 | alloca > probe_size, align > probe_size define i32 @foo2(i64 %i) local_unnamed_addr #0 { ; CHECK-LABEL: foo2: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: andq $-2048, %rsp # imm = 0xF800 -; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 -; CHECK-NEXT: movl $1, (%rsp,%rdi,4) -; CHECK-NEXT: movl (%rsp), %eax -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq - +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-2048, %rsp # imm = 0xF800 +; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $2048, %rsp # imm = 0x800 +; CHECK-NEXT: movl $1, (%rsp,%rdi,4) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %a = alloca i32, i32 2000, align 2048 %b = getelementptr inbounds i32, i32* %a, i64 %i store volatile i32 1, i32* %b @@ -60,24 +59,22 @@ define i32 @foo2(i64 %i) local_unnamed_addr #0 { ; | case3 | alloca < probe_size, align < probe_size, alloca + align > probe_size define i32 @foo3(i64 %i) local_unnamed_addr #0 { ; CHECK-LABEL: foo3: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 -; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: subq $1024, %rsp # imm = 0x400 -; CHECK-NEXT: movl $1, (%rsp,%rdi,4) -; CHECK-NEXT: movl (%rsp), %eax -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq - - +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00 +; CHECK-NEXT: subq $3072, %rsp # imm = 0xC00 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $1024, %rsp # imm = 0x400 +; CHECK-NEXT: movl $1, (%rsp,%rdi,4) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %a = alloca i32, i32 1000, align 1024 %b = getelementptr inbounds i32, i32* %a, i64 %i store volatile i32 1, i32* %b @@ -88,40 +85,39 @@ define i32 @foo3(i64 %i) local_unnamed_addr #0 { ; | case4 | alloca + probe_size < probe_size, followed by dynamic alloca define i32 @foo4(i64 %i) local_unnamed_addr #0 { ; CHECK-LABEL: foo4: -; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; 
CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: andq $-64, %rsp -; CHECK-NEXT: subq $896, %rsp # imm = 0x380 -; CHECK-NEXT: movq %rsp, %rbx -; CHECK-NEXT: .cfi_offset %rbx, -24 -; CHECK-NEXT: movl $1, (%rbx,%rdi,4) -; CHECK-NEXT: movl (%rbx), %ecx -; CHECK-NEXT: movq %rsp, %rax -; CHECK-NEXT: leaq 15(,%rcx,4), %rcx -; CHECK-NEXT: andq $-16, %rcx -; CHECK-NEXT: subq %rcx, %rax -; CHECK-NEXT: cmpq %rsp, %rax -; CHECK-NEXT: jge .LBB3_3 -; CHECK-NEXT:.LBB3_2: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: xorq $0, (%rsp) -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: cmpq %rsp, %rax -; CHECK-NEXT: jl .LBB3_2 -; CHECK-NEXT:.LBB3_3: -; CHECK-NEXT: andq $-64, %rax -; CHECK-NEXT: movq %rax, %rsp -; CHECK-NEXT: movl (%rax), %eax -; CHECK-NEXT: leaq -8(%rbp), %rsp -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq - +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: andq $-64, %rsp +; CHECK-NEXT: subq $896, %rsp # imm = 0x380 +; CHECK-NEXT: movq %rsp, %rbx +; CHECK-NEXT: .cfi_offset %rbx, -24 +; CHECK-NEXT: movl $1, (%rbx,%rdi,4) +; CHECK-NEXT: movl (%rbx), %ecx +; CHECK-NEXT: movq %rsp, %rax +; CHECK-NEXT: leaq 15(,%rcx,4), %rcx +; CHECK-NEXT: andq $-16, %rcx +; CHECK-NEXT: subq %rcx, %rax +; CHECK-NEXT: cmpq %rsp, %rax +; CHECK-NEXT: jge .LBB3_3 +; CHECK-NEXT: .LBB3_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: xorq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %rax +; CHECK-NEXT: jl .LBB3_2 +; CHECK-NEXT: .LBB3_3: +; CHECK-NEXT: andq $-64, %rax +; CHECK-NEXT: movq %rax, %rsp +; CHECK-NEXT: movl (%rax), %eax +; CHECK-NEXT: leaq -8(%rbp), %rsp +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %a = alloca i32, i32 200, align 64 %b = getelementptr inbounds i32, i32* %a, i64 %i store volatile i32 1, i32* %b @@ -132,4 +128,3 @@ define i32 @foo4(i64 %i) local_unnamed_addr #0 { } attributes #0 = {"probe-stack"="inline-asm"} - diff --git a/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll index e608bab90415..c0541a8077ba 100644 --- a/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll +++ b/llvm/test/CodeGen/X86/stack-clash-small-large-align.ll @@ -1,28 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp ; RUN: llc < %s | FileCheck %s - - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define i32 @foo_noprotect() local_unnamed_addr { ; CHECK-LABEL: foo_noprotect: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: andq $-65536, %rsp -; CHECK-NEXT: subq $65536, %rsp -; CHECK-NEXT: movl $1, 392(%rsp) -; CHECK-NEXT: movl (%rsp), %eax -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq - - - +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; 
CHECK-NEXT: andq $-65536, %rsp # imm = 0xFFFF0000 +; CHECK-NEXT: subq $65536, %rsp # imm = 0x10000 +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %a = alloca i32, i64 100, align 65536 %b = getelementptr inbounds i32, i32* %a, i64 98 store volatile i32 1, i32* %b @@ -33,46 +29,42 @@ define i32 @foo_noprotect() local_unnamed_addr { define i32 @foo_protect() local_unnamed_addr #0 { ; CHECK-LABEL: foo_protect: ; CHECK: # %bb.0: -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %rsp, %rbp -; CHECK-NEXT: .cfi_def_cfa_register %rbp -; CHECK-NEXT: movq %rsp, %r11 -; CHECK-NEXT: andq $-65536, %r11 # imm = 0xFFFF0000 -; CHECK-NEXT: cmpq %rsp, %r11 -; CHECK-NEXT: je .LBB1_4 -; CHECK-NEXT:# %bb.1: -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: cmpq %rsp, %r11 -; CHECK-NEXT: jb .LBB1_3 -; CHECK-NEXT:.LBB1_2: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: cmpq %rsp, %r11 -; CHECK-NEXT: jb .LBB1_2 -; CHECK-NEXT:.LBB1_3: -; CHECK-NEXT: movq %r11, %rsp -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT:.LBB1_4: -; CHECK-NEXT: movq %rsp, %r11 -; CHECK-NEXT: subq $65536, %r11 # imm = 0x10000 -; CHECK-NEXT:.LBB1_5: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 -; CHECK-NEXT: movq $0, (%rsp) -; CHECK-NEXT: cmpq %r11, %rsp -; CHECK-NEXT: jne .LBB1_5 -; CHECK-NEXT:# %bb.6: -; CHECK-NEXT: movl $1, 392(%rsp) -; CHECK-NEXT: movl (%rsp), %eax -; CHECK-NEXT: movq %rbp, %rsp -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa %rsp, 8 -; CHECK-NEXT: retq - - - - +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq %rsp, %rbp +; CHECK-NEXT: .cfi_def_cfa_register %rbp +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: andq $-65536, %r11 # imm = 0xFFFF0000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: je .LBB1_4 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_3 +; CHECK-NEXT: .LBB1_2: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: cmpq %rsp, %r11 +; CHECK-NEXT: jb .LBB1_2 +; CHECK-NEXT: .LBB1_3: +; CHECK-NEXT: movq %r11, %rsp +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: movq %rsp, %r11 +; CHECK-NEXT: subq $65536, %r11 # imm = 0x10000 +; CHECK-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: cmpq %r11, %rsp +; CHECK-NEXT: jne .LBB1_5 +; CHECK-NEXT: # %bb.6: +; CHECK-NEXT: movl $1, 392(%rsp) +; CHECK-NEXT: movl (%rsp), %eax +; CHECK-NEXT: movq %rbp, %rsp +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-NEXT: retq %a = alloca i32, i64 100, align 65536 %b = getelementptr inbounds i32, i32* %a, i64 98 store volatile i32 1, i32* %b diff --git a/llvm/test/CodeGen/X86/stack-clash-small.ll b/llvm/test/CodeGen/X86/stack-clash-small.ll index bf40fe907dc2..ecfaf7a1c4f1 100644 --- a/llvm/test/CodeGen/X86/stack-clash-small.ll +++ b/llvm/test/CodeGen/X86/stack-clash-small.ll @@ -1,20 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp ; RUN: llc < %s | FileCheck %s - - target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" define i32 @foo() local_unnamed_addr #0 { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $280, %rsp # imm = 0x118 -; CHECK-NEXT: .cfi_def_cfa_offset 288 -; CHECK-NEXT: movl $1, 264(%rsp) -; CHECK-NEXT: movl -128(%rsp), %eax -; CHECK-NEXT: addq $280, %rsp # imm = 0x118 -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq - +; CHECK-NEXT: subq $280, %rsp # imm = 0x118 +; CHECK-NEXT: .cfi_def_cfa_offset 288 +; CHECK-NEXT: movl $1, 264(%rsp) +; CHECK-NEXT: movl -128(%rsp), %eax +; CHECK-NEXT: addq $280, %rsp # imm = 0x118 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %a = alloca i32, i64 100, align 16 %b = getelementptr inbounds i32, i32* %a, i64 98 store volatile i32 1, i32* %b diff --git a/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll b/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll index 9294d70528fa..2df3eca65460 100644 --- a/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll +++ b/llvm/test/CodeGen/X86/stack-clash-unknown-call.ll @@ -1,28 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp ; RUN: llc < %s | FileCheck %s - - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg); -define void @foo() local_unnamed_addr #0 { - -;CHECK-LABEL: foo: -;CHECK: # %bb.0: -;CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 ; it's important that we don't use the call as a probe here -;CHECK-NEXT: movq $0, (%rsp) -;CHECK-NEXT: subq $3912, %rsp # imm = 0xF48 -;CHECK-NEXT: .cfi_def_cfa_offset 8016 -;CHECK-NEXT: movq %rsp, %rdi -;CHECK-NEXT: movl $8000, %edx # imm = 0x1F40 -;CHECK-NEXT: xorl %esi, %esi -;CHECK-NEXT: callq memset -;CHECK-NEXT: addq $8008, %rsp # imm = 0x1F48 -;CHECK-NEXT: .cfi_def_cfa_offset 8 -;CHECK-NEXT: retq - +define void @foo() local_unnamed_addr #0 { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $4096, %rsp # imm = 0x1000 +; CHECK-NEXT: movq $0, (%rsp) +; CHECK-NEXT: subq $3912, %rsp # imm = 0xF48 +; CHECK-NEXT: .cfi_def_cfa_offset 8016 +; CHECK-NEXT: movq %rsp, %rdi +; CHECK-NEXT: movl $8000, %edx # imm = 0x1F40 +; CHECK-NEXT: xorl %esi, %esi +; CHECK-NEXT: callq memset@PLT +; CHECK-NEXT: addq $8008, %rsp # imm = 0x1F48 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq %a = alloca i8, i64 8000, align 16 call void @llvm.memset.p0i8.i64(i8* align 16 %a, i8 0, i64 8000, i1 false) ret void -- GitLab From 4f750f6ebc412869ce6bb28331313a9c9a9d9af7 Mon Sep 17 00:00:00 2001 From: Abhina Sreeskantharajan Date: Fri, 19 Mar 2021 08:09:01 -0400 Subject: [PATCH 0137/1000] [SystemZ][z/OS] Distinguish between text and binary files on z/OS This patch consists of the initial changes to help distinguish between text and binary content correctly on z/OS. I would like to get feedback from Windows users on setting OF_None for all ToolOutputFiles. This seems to have been done as an optimization to prevent CRLF translation on Windows in the past. 
Reviewed By: zibi

Differential Revision: https://reviews.llvm.org/D97785
---
 clang/lib/Frontend/CompilerInstance.cpp       |  9 ++-
 clang/lib/Frontend/FrontendActions.cpp        | 55 ++++++++++---------
 .../StaticAnalyzer/Core/HTMLDiagnostics.cpp   | 10 ++--
 clang/tools/arcmt-test/arcmt-test.cpp         | 12 +++-
 llvm/include/llvm/Support/FileSystem.h        |  9 ++-
 llvm/include/llvm/Support/MemoryBuffer.h      |  8 ++-
 llvm/lib/IRReader/IRReader.cpp                |  4 +-
 llvm/lib/Support/MemoryBuffer.cpp             | 26 +++++----
 llvm/lib/Support/Path.cpp                     | 48 ++++++++--------
 llvm/lib/Support/ToolOutputFile.cpp           |  8 ++-
 llvm/lib/TableGen/Main.cpp                    |  9 +--
 llvm/utils/FileCheck/FileCheck.cpp            |  8 ++-
 12 files changed, 121 insertions(+), 85 deletions(-)

diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index d40240b5b527..284b20cb400a 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -814,15 +814,18 @@ CompilerInstance::createOutputFileImpl(StringRef OutputPath, bool Binary,
     TempPath += OutputExtension;
     TempPath += ".tmp";
     int fd;
-    std::error_code EC =
-        llvm::sys::fs::createUniqueFile(TempPath, fd, TempPath);
+    std::error_code EC = llvm::sys::fs::createUniqueFile(
+        TempPath, fd, TempPath,
+        Binary ? llvm::sys::fs::OF_None : llvm::sys::fs::OF_Text);

     if (CreateMissingDirectories &&
         EC == llvm::errc::no_such_file_or_directory) {
       StringRef Parent = llvm::sys::path::parent_path(OutputPath);
       EC = llvm::sys::fs::create_directories(Parent);
       if (!EC) {
-        EC = llvm::sys::fs::createUniqueFile(TempPath, fd, TempPath);
+        EC = llvm::sys::fs::createUniqueFile(TempPath, fd, TempPath,
+                                             Binary ? llvm::sys::fs::OF_None
+                                                    : llvm::sys::fs::OF_Text);
       }
     }

diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp
index 38b6f753134c..4e5043b6c75b 100644
--- a/clang/lib/Frontend/FrontendActions.cpp
+++ b/clang/lib/Frontend/FrontendActions.cpp
@@ -795,7 +795,7 @@ void PreprocessOnlyAction::ExecuteAction() {
 void PrintPreprocessedAction::ExecuteAction() {
   CompilerInstance &CI = getCompilerInstance();
   // Output file may need to be set to 'Binary', to avoid converting Unix style
-  // line feeds (\n) to Microsoft style line feeds (\r\n).
+  // line feeds (\n) to Microsoft style line feeds (\r\n) on Windows.
   //
   // Look to see what type of line endings the file uses. If there's a
   // CRLF, then we won't open the file up in binary mode. If there is
@@ -807,30 +807,35 @@ void PrintPreprocessedAction::ExecuteAction() {
   // all of their source code on a single line. However, that is still a
   // concern, so if we scan for too long, we'll just assume the file should
   // be opened in binary mode.
-  bool BinaryMode = true;
-  const SourceManager& SM = CI.getSourceManager();
-  if (llvm::Optional<llvm::MemoryBufferRef> Buffer =
-      SM.getBufferOrNone(SM.getMainFileID())) {
-    const char *cur = Buffer->getBufferStart();
-    const char *end = Buffer->getBufferEnd();
-    const char *next = (cur != end) ? cur + 1 : end;
-
-    // Limit ourselves to only scanning 256 characters into the source
-    // file. This is mostly a sanity check in case the file has no
-    // newlines whatsoever.
- if (end - cur > 256) end = cur + 256; - - while (next < end) { - if (*cur == 0x0D) { // CR - if (*next == 0x0A) // CRLF - BinaryMode = false; - - break; - } else if (*cur == 0x0A) // LF - break; - - ++cur; - ++next; + + bool BinaryMode = false; + if (llvm::Triple(LLVM_HOST_TRIPLE).isOSWindows()) { + BinaryMode = true; + const SourceManager &SM = CI.getSourceManager(); + if (llvm::Optional Buffer = + SM.getBufferOrNone(SM.getMainFileID())) { + const char *cur = Buffer->getBufferStart(); + const char *end = Buffer->getBufferEnd(); + const char *next = (cur != end) ? cur + 1 : end; + + // Limit ourselves to only scanning 256 characters into the source + // file. This is mostly a sanity check in case the file has no + // newlines whatsoever. + if (end - cur > 256) + end = cur + 256; + + while (next < end) { + if (*cur == 0x0D) { // CR + if (*next == 0x0A) // CRLF + BinaryMode = false; + + break; + } else if (*cur == 0x0A) // LF + break; + + ++cur; + ++next; + } } } diff --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp index fe530bce4a3e..64fc32ea7554 100644 --- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp @@ -275,11 +275,11 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D, << "' absolute: " << EC.message() << '\n'; return; } - if (std::error_code EC = - llvm::sys::fs::createUniqueFile(Model, FD, ResultPath)) { - llvm::errs() << "warning: could not create file in '" << Directory - << "': " << EC.message() << '\n'; - return; + if (std::error_code EC = llvm::sys::fs::createUniqueFile( + Model, FD, ResultPath, llvm::sys::fs::OF_Text)) { + llvm::errs() << "warning: could not create file in '" << Directory + << "': " << EC.message() << '\n'; + return; } } else { int i = 1; diff --git a/clang/tools/arcmt-test/arcmt-test.cpp b/clang/tools/arcmt-test/arcmt-test.cpp index 940e622b8a68..e4764ad1f457 100644 --- a/clang/tools/arcmt-test/arcmt-test.cpp +++ b/clang/tools/arcmt-test/arcmt-test.cpp @@ -207,11 +207,15 @@ static bool performTransformations(StringRef resourcesPath, static bool filesCompareEqual(StringRef fname1, StringRef fname2) { using namespace llvm; - ErrorOr> file1 = MemoryBuffer::getFile(fname1); + ErrorOr> file1 = MemoryBuffer::getFile( + fname1, /*FileSize*/ -1, /*RequiresNullTerminator*/ true, + /*IsVolatile*/ false, /*IsText*/ true); if (!file1) return false; - ErrorOr> file2 = MemoryBuffer::getFile(fname2); + ErrorOr> file2 = MemoryBuffer::getFile( + fname2, /*FileSize*/ -1, /*RequiresNullTerminator*/ true, + /*IsVolatile*/ false, /*IsText*/ true); if (!file2) return false; @@ -240,7 +244,9 @@ static bool verifyTransformedFiles(ArrayRef resultFiles) { if (RemappingsFile.empty()) inputBuf = MemoryBuffer::getSTDIN(); else - inputBuf = MemoryBuffer::getFile(RemappingsFile); + inputBuf = MemoryBuffer::getFile(RemappingsFile, /*FileSize*/ -1, + /*RequiresNullTerminator*/ true, + /*IsVolatile*/ false, /*IsText*/ true); if (!inputBuf) { errs() << "error: could not read remappings input\n"; return true; diff --git a/llvm/include/llvm/Support/FileSystem.h b/llvm/include/llvm/Support/FileSystem.h index d82e966215dc..1dc88243e835 100644 --- a/llvm/include/llvm/Support/FileSystem.h +++ b/llvm/include/llvm/Support/FileSystem.h @@ -802,10 +802,13 @@ void createUniquePath(const Twine &Model, SmallVectorImpl &ResultPath, /// @param Model Name to base unique path off of. /// @param ResultFD Set to the opened file's file descriptor. 
/// @param ResultPath Set to the opened file's absolute path. +/// @param Flags Set to the opened file's flags. +/// @param Mode Set to the opened file's permissions. /// @returns errc::success if Result{FD,Path} have been successfully set, /// otherwise a platform-specific error_code. std::error_code createUniqueFile(const Twine &Model, int &ResultFD, SmallVectorImpl &ResultPath, + OpenFlags Flags = OF_None, unsigned Mode = all_read | all_write); /// Simpler version for clients that don't want an open file. An empty @@ -862,12 +865,14 @@ public: /// running the assembler. std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, int &ResultFD, - SmallVectorImpl &ResultPath); + SmallVectorImpl &ResultPath, + OpenFlags Flags = OF_None); /// Simpler version for clients that don't want an open file. An empty /// file will still be created. std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, - SmallVectorImpl &ResultPath); + SmallVectorImpl &ResultPath, + OpenFlags Flags = OF_None); std::error_code createUniqueDirectory(const Twine &Prefix, SmallVectorImpl &ResultPath); diff --git a/llvm/include/llvm/Support/MemoryBuffer.h b/llvm/include/llvm/Support/MemoryBuffer.h index 9e6ee2536c5e..eccb7ee01e6f 100644 --- a/llvm/include/llvm/Support/MemoryBuffer.h +++ b/llvm/include/llvm/Support/MemoryBuffer.h @@ -82,9 +82,13 @@ public: /// \param IsVolatile Set to true to indicate that the contents of the file /// can change outside the user's control, e.g. when libclang tries to parse /// while the user is editing/updating the file or if the file is on an NFS. + /// + /// \param IsText Set to true to indicate that the file should be read in + /// text mode. static ErrorOr> getFile(const Twine &Filename, int64_t FileSize = -1, - bool RequiresNullTerminator = true, bool IsVolatile = false); + bool RequiresNullTerminator = true, bool IsVolatile = false, + bool IsText = false); /// Read all of the specified file into a MemoryBuffer as a stream /// (i.e. until EOF reached). This is useful for special files that @@ -130,7 +134,7 @@ public: /// is "-". static ErrorOr> getFileOrSTDIN(const Twine &Filename, int64_t FileSize = -1, - bool RequiresNullTerminator = true); + bool RequiresNullTerminator = true, bool IsText = false); /// Map a subrange of the specified file as a MemoryBuffer. 
static ErrorOr> diff --git a/llvm/lib/IRReader/IRReader.cpp b/llvm/lib/IRReader/IRReader.cpp index e7fd835f8ad0..69757a5f136b 100644 --- a/llvm/lib/IRReader/IRReader.cpp +++ b/llvm/lib/IRReader/IRReader.cpp @@ -92,7 +92,9 @@ std::unique_ptr llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context, DataLayoutCallbackTy DataLayoutCallback) { ErrorOr> FileOrErr = - MemoryBuffer::getFileOrSTDIN(Filename); + MemoryBuffer::getFileOrSTDIN(Filename, /*FileSize*/ -1, + /*RequiresNullTerminator*/ true, + /*IsText*/ true); if (std::error_code EC = FileOrErr.getError()) { Err = SMDiagnostic(Filename, SourceMgr::DK_Error, "Could not open input file: " + EC.message()); diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index a05b7d8ddd0e..955bf113fd79 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -106,7 +106,8 @@ public: template static ErrorOr> getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize, - uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile); + uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile, + bool IsText); std::unique_ptr MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName, @@ -141,20 +142,20 @@ MemoryBuffer::getMemBufferCopy(StringRef InputData, const Twine &BufferName) { ErrorOr> MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize, - bool RequiresNullTerminator) { + bool RequiresNullTerminator, bool IsText) { SmallString<256> NameBuf; StringRef NameRef = Filename.toStringRef(NameBuf); if (NameRef == "-") return getSTDIN(); - return getFile(Filename, FileSize, RequiresNullTerminator); + return getFile(Filename, FileSize, RequiresNullTerminator, false, IsText); } ErrorOr> MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize, uint64_t Offset, bool IsVolatile) { return getFileAux(FilePath, -1, MapSize, Offset, false, - IsVolatile); + IsVolatile, false); } //===----------------------------------------------------------------------===// @@ -240,12 +241,12 @@ getMemoryBufferForStream(sys::fs::file_t FD, const Twine &BufferName) { return getMemBufferCopyImpl(Buffer, BufferName); } - ErrorOr> MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize, - bool RequiresNullTerminator, bool IsVolatile) { + bool RequiresNullTerminator, bool IsVolatile, + bool IsText) { return getFileAux(Filename, FileSize, FileSize, 0, - RequiresNullTerminator, IsVolatile); + RequiresNullTerminator, IsVolatile, IsText); } template @@ -257,9 +258,10 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, template static ErrorOr> getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize, - uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile) { - Expected FDOrErr = - sys::fs::openNativeFileForRead(Filename, sys::fs::OF_None); + uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile, + bool IsText) { + Expected FDOrErr = sys::fs::openNativeFileForRead( + Filename, IsText ? 
sys::fs::OF_Text : sys::fs::OF_None); if (!FDOrErr) return errorToErrorCode(FDOrErr.takeError()); sys::fs::file_t FD = *FDOrErr; @@ -274,14 +276,14 @@ WritableMemoryBuffer::getFile(const Twine &Filename, int64_t FileSize, bool IsVolatile) { return getFileAux(Filename, FileSize, FileSize, 0, /*RequiresNullTerminator*/ false, - IsVolatile); + IsVolatile, false); } ErrorOr> WritableMemoryBuffer::getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset, bool IsVolatile) { return getFileAux(Filename, -1, MapSize, Offset, false, - IsVolatile); + IsVolatile, false); } std::unique_ptr diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp index ef223ae5ac1d..f49affb3fa99 100644 --- a/llvm/lib/Support/Path.cpp +++ b/llvm/lib/Support/Path.cpp @@ -167,8 +167,8 @@ enum FSEntity { static std::error_code createUniqueEntity(const Twine &Model, int &ResultFD, SmallVectorImpl &ResultPath, bool MakeAbsolute, - unsigned Mode, FSEntity Type, - sys::fs::OpenFlags Flags = sys::fs::OF_None) { + FSEntity Type, sys::fs::OpenFlags Flags = sys::fs::OF_None, + unsigned Mode = 0) { // Limit the number of attempts we make, so that we don't infinite loop. E.g. // "permission denied" could be for a specific file (so we retry with a @@ -816,22 +816,16 @@ void createUniquePath(const Twine &Model, SmallVectorImpl &ResultPath, std::error_code createUniqueFile(const Twine &Model, int &ResultFd, SmallVectorImpl &ResultPath, - unsigned Mode) { - return createUniqueEntity(Model, ResultFd, ResultPath, false, Mode, FS_File); -} - -static std::error_code createUniqueFile(const Twine &Model, int &ResultFd, - SmallVectorImpl &ResultPath, - unsigned Mode, OpenFlags Flags) { - return createUniqueEntity(Model, ResultFd, ResultPath, false, Mode, FS_File, - Flags); + OpenFlags Flags, unsigned Mode) { + return createUniqueEntity(Model, ResultFd, ResultPath, false, FS_File, Flags, + Mode); } std::error_code createUniqueFile(const Twine &Model, SmallVectorImpl &ResultPath, unsigned Mode) { int FD; - auto EC = createUniqueFile(Model, FD, ResultPath, Mode); + auto EC = createUniqueFile(Model, FD, ResultPath, OF_None, Mode); if (EC) return EC; // FD is only needed to avoid race conditions. Close it right away. @@ -841,34 +835,39 @@ std::error_code createUniqueFile(const Twine &Model, static std::error_code createTemporaryFile(const Twine &Model, int &ResultFD, - llvm::SmallVectorImpl &ResultPath, FSEntity Type) { + llvm::SmallVectorImpl &ResultPath, FSEntity Type, + sys::fs::OpenFlags Flags = sys::fs::OF_None) { SmallString<128> Storage; StringRef P = Model.toNullTerminatedStringRef(Storage); assert(P.find_first_of(separators(Style::native)) == StringRef::npos && "Model must be a simple filename."); // Use P.begin() so that createUniqueEntity doesn't need to recreate Storage. - return createUniqueEntity(P.begin(), ResultFD, ResultPath, true, - owner_read | owner_write, Type); + return createUniqueEntity(P.begin(), ResultFD, ResultPath, true, Type, Flags, + owner_read | owner_write); } static std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, int &ResultFD, - llvm::SmallVectorImpl &ResultPath, FSEntity Type) { + llvm::SmallVectorImpl &ResultPath, FSEntity Type, + sys::fs::OpenFlags Flags = sys::fs::OF_None) { const char *Middle = Suffix.empty() ? 
"-%%%%%%" : "-%%%%%%."; return createTemporaryFile(Prefix + Middle + Suffix, ResultFD, ResultPath, - Type); + Type, Flags); } std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, int &ResultFD, - SmallVectorImpl &ResultPath) { - return createTemporaryFile(Prefix, Suffix, ResultFD, ResultPath, FS_File); + SmallVectorImpl &ResultPath, + sys::fs::OpenFlags Flags) { + return createTemporaryFile(Prefix, Suffix, ResultFD, ResultPath, FS_File, + Flags); } std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, - SmallVectorImpl &ResultPath) { + SmallVectorImpl &ResultPath, + sys::fs::OpenFlags Flags) { int FD; - auto EC = createTemporaryFile(Prefix, Suffix, FD, ResultPath); + auto EC = createTemporaryFile(Prefix, Suffix, FD, ResultPath, Flags); if (EC) return EC; // FD is only needed to avoid race conditions. Close it right away. @@ -876,13 +875,12 @@ std::error_code createTemporaryFile(const Twine &Prefix, StringRef Suffix, return EC; } - // This is a mkdtemp with a different pattern. We use createUniqueEntity mostly // for consistency. We should try using mkdtemp. std::error_code createUniqueDirectory(const Twine &Prefix, SmallVectorImpl &ResultPath) { int Dummy; - return createUniqueEntity(Prefix + "-%%%%%%", Dummy, ResultPath, true, 0, + return createUniqueEntity(Prefix + "-%%%%%%", Dummy, ResultPath, true, FS_Dir); } @@ -890,7 +888,7 @@ std::error_code getPotentiallyUniqueFileName(const Twine &Model, SmallVectorImpl &ResultPath) { int Dummy; - return createUniqueEntity(Model, Dummy, ResultPath, false, 0, FS_Name); + return createUniqueEntity(Model, Dummy, ResultPath, false, FS_Name); } std::error_code @@ -1279,7 +1277,7 @@ Expected TempFile::create(const Twine &Model, unsigned Mode) { int FD; SmallString<128> ResultPath; if (std::error_code EC = - createUniqueFile(Model, FD, ResultPath, Mode, OF_Delete)) + createUniqueFile(Model, FD, ResultPath, OF_Delete, Mode)) return errorCodeToError(EC); TempFile Ret(ResultPath, FD); diff --git a/llvm/lib/Support/ToolOutputFile.cpp b/llvm/lib/Support/ToolOutputFile.cpp index c2ca97a59c62..3735aac79e2f 100644 --- a/llvm/lib/Support/ToolOutputFile.cpp +++ b/llvm/lib/Support/ToolOutputFile.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ToolOutputFile.h" +#include "llvm/ADT/Triple.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Signals.h" using namespace llvm; @@ -45,7 +46,12 @@ ToolOutputFile::ToolOutputFile(StringRef Filename, std::error_code &EC, EC = std::error_code(); return; } - OSHolder.emplace(Filename, EC, Flags); + + // On Windows, we set the OF_None flag even for text files to avoid + // CRLF translation. + OSHolder.emplace( + Filename, EC, + llvm::Triple(LLVM_HOST_TRIPLE).isOSWindows() ? sys::fs::OF_None : Flags); OS = OSHolder.getPointer(); // If open fails, no cleanup is needed. 
if (EC) diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index 0ace5363dd05..75f4d423d4d2 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -70,7 +70,7 @@ static int createDependencyFile(const TGParser &Parser, const char *argv0) { return reportError(argv0, "the option -d must be used together with -o\n"); std::error_code EC; - ToolOutputFile DepOut(DependFilename, EC, sys::fs::OF_None); + ToolOutputFile DepOut(DependFilename, EC, sys::fs::OF_Text); if (EC) return reportError(argv0, "error opening " + DependFilename + ":" + EC.message() + "\n"); @@ -93,7 +93,7 @@ int llvm::TableGenMain(const char *argv0, TableGenMainFn *MainFn) { Records.startTimer("Parse, build records"); ErrorOr> FileOrErr = - MemoryBuffer::getFileOrSTDIN(InputFilename); + MemoryBuffer::getFileOrSTDIN(InputFilename, -1, true, true); if (std::error_code EC = FileOrErr.getError()) return reportError(argv0, "Could not open input file '" + InputFilename + "': " + EC.message() + "\n"); @@ -137,13 +137,14 @@ int llvm::TableGenMain(const char *argv0, TableGenMainFn *MainFn) { // Only updates the real output file if there are any differences. // This prevents recompilation of all the files depending on it if there // aren't any. - if (auto ExistingOrErr = MemoryBuffer::getFile(OutputFilename)) + if (auto ExistingOrErr = + MemoryBuffer::getFile(OutputFilename, -1, true, false, true)) if (std::move(ExistingOrErr.get())->getBuffer() == Out.str()) WriteFile = false; } if (WriteFile) { std::error_code EC; - ToolOutputFile OutFile(OutputFilename, EC, sys::fs::OF_None); + ToolOutputFile OutFile(OutputFilename, EC, sys::fs::OF_Text); if (EC) return reportError(argv0, "error opening " + OutputFilename + ": " + EC.message() + "\n"); diff --git a/llvm/utils/FileCheck/FileCheck.cpp b/llvm/utils/FileCheck/FileCheck.cpp index 668dd9844400..0ee105d0bf46 100644 --- a/llvm/utils/FileCheck/FileCheck.cpp +++ b/llvm/utils/FileCheck/FileCheck.cpp @@ -821,7 +821,9 @@ int main(int argc, char **argv) { // Read the expected strings from the check file. ErrorOr> CheckFileOrErr = - MemoryBuffer::getFileOrSTDIN(CheckFilename); + MemoryBuffer::getFileOrSTDIN(CheckFilename, /*FileSize*/ -1, + /*RequiresNullTerminator*/ true, + /*IsText*/ true); if (std::error_code EC = CheckFileOrErr.getError()) { errs() << "Could not open check file '" << CheckFilename << "': " << EC.message() << '\n'; @@ -843,7 +845,9 @@ int main(int argc, char **argv) { // Open the file to check and add it to SourceMgr. ErrorOr> InputFileOrErr = - MemoryBuffer::getFileOrSTDIN(InputFilename); + MemoryBuffer::getFileOrSTDIN(InputFilename, /*FileSize*/ -1, + /*RequiresNullTerminator*/ true, + /*IsText*/ true); if (InputFilename == "-") InputFilename = ""; // Overwrite for improved diagnostic messages if (std::error_code EC = InputFileOrErr.getError()) { -- GitLab From a8697c57fa994ebb9524d837ba1ebe7ab00bfb6e Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Fri, 19 Mar 2021 06:24:42 -0500 Subject: [PATCH 0138/1000] [PowerPC] Fix the check for 16-bit signed field in peephole When a D-Form instruction is fed by an add-immediate, we attempt to merge the two immediates to form a single displacement so we can remove the add-immediate. However, we don't check whether the new displacement fits into a 16-bit signed immediate field early enough. Namely, we do a sign-extend from 16 bits first which will discard high bits and then we check whether the result is a 16-bit signed immediate. It of course will always be. 
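To make the failure mode concrete (illustrative operand values, not taken from
the attached test case): merging an add-immediate of 32000 into a base
displacement of 4000 yields 36000, which does not fit in a signed 16-bit
field. The old code first computed SignExtend64<16>(36000), which is -29536,
and -29536 trivially satisfies isSignedIntN(16), so the out-of-range
displacement was accepted.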
Move the check prior to the sign extend to ensure we are checking the correct value. Fixes https://bugs.llvm.org/show_bug.cgi?id=49640 --- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 14 ++-- .../CodeGen/PowerPC/out-of-range-dform.ll | 67 +++++++++++++++++++ 2 files changed, 72 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/out-of-range-dform.ll diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 4d0595689d9e..bc25b37452b1 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -4416,21 +4416,17 @@ bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO, // Sign-extend to 64-bits. // DefMI may be folded with another imm form instruction, the result Imm is // the sum of Imm of DefMI and BaseImm which is from imm form instruction. + APInt ActualValue(64, ImmMO.getImm() + BaseImm, true); + if (III.SignedImm && !ActualValue.isSignedIntN(III.ImmWidth)) + return false; + if (!III.SignedImm && !ActualValue.isIntN(III.ImmWidth)) + return false; Imm = SignExtend64<16>(ImmMO.getImm() + BaseImm); if (Imm % III.ImmMustBeMultipleOf) return false; if (III.TruncateImmTo) Imm &= ((1 << III.TruncateImmTo) - 1); - if (III.SignedImm) { - APInt ActualValue(64, Imm, true); - if (!ActualValue.isSignedIntN(III.ImmWidth)) - return false; - } else { - uint64_t UnsignedMax = (1 << III.ImmWidth) - 1; - if ((uint64_t)Imm > UnsignedMax) - return false; - } } else return false; diff --git a/llvm/test/CodeGen/PowerPC/out-of-range-dform.ll b/llvm/test/CodeGen/PowerPC/out-of-range-dform.ll new file mode 100644 index 000000000000..13b68a18ac79 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/out-of-range-dform.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mcpu=pwr9 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \ +; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s \ +; RUN: -check-prefix=CHECK-P9 + +@_ZL3num = external dso_local unnamed_addr global float, align 4 + +define dso_local void @main() local_unnamed_addr personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-P9-LABEL: main: +; CHECK-P9: # %bb.0: # %bb +; CHECK-P9-NEXT: mflr r0 +; CHECK-P9-NEXT: std r0, 16(r1) +; CHECK-P9-NEXT: stdu r1, -32(r1) +; CHECK-P9-NEXT: .cfi_def_cfa_offset 32 +; CHECK-P9-NEXT: .cfi_offset lr, 16 +; CHECK-P9-NEXT: bl malloc +; CHECK-P9-NEXT: nop +; CHECK-P9-NEXT: addis r4, r2, _ZL3num@toc@ha +; CHECK-P9-NEXT: addi r3, r3, -25400 +; CHECK-P9-NEXT: lfs f0, _ZL3num@toc@l(r4) +; CHECK-P9-NEXT: addis r4, r2, .LCPI0_0@toc@ha +; CHECK-P9-NEXT: lfs f1, .LCPI0_0@toc@l(r4) +; CHECK-P9-NEXT: li r4, 0 +; CHECK-P9-NEXT: xsmulsp f0, f0, f1 +; CHECK-P9-NEXT: cmpldi r4, 0 +; CHECK-P9-NEXT: beq- cr0, .LBB0_2 +; CHECK-P9-NEXT: .p2align 5 +; CHECK-P9-NEXT: .LBB0_1: # %bb5 +; CHECK-P9-NEXT: # +; CHECK-P9-NEXT: addi r3, r3, 25400 +; CHECK-P9-NEXT: addi r4, r4, 25400 +; CHECK-P9-NEXT: stfs f0, 15240(r3) +; CHECK-P9-NEXT: cmpldi r4, 0 +; CHECK-P9-NEXT: bne+ cr0, .LBB0_1 +; CHECK-P9-NEXT: .LBB0_2: # %bb16 +bb: + %i = tail call noalias dereferenceable_or_null(6451600) i8* @malloc() + %i1 = bitcast i8* %i to float* + br label %bb2 + +bb2: ; preds = %bb5, %bb + %i3 = phi i64 [ 0, %bb ], [ %i15, %bb5 ] + %i4 = icmp eq i64 %i3, 0 + br i1 %i4, label %bb16, label %bb5 + +bb5: ; preds = %bb2 + %i6 = mul nuw nsw i64 %i3, 1270 + %i7 = add nuw nsw i64 %i6, 0 + %i8 = getelementptr inbounds float, float* %i1, i64 %i7 + store float undef, 
float* %i8, align 4 + %i9 = add nuw nsw i64 %i3, 3 + %i10 = load float, float* @_ZL3num, align 4 + %i11 = fmul float %i10, 0x3E00000000000000 + %i12 = mul nuw nsw i64 %i9, 1270 + %i13 = add nuw nsw i64 %i12, 0 + %i14 = getelementptr inbounds float, float* %i1, i64 %i13 + store float %i11, float* %i14, align 4 + %i15 = add nuw nsw i64 %i3, 5 + br label %bb2 + +bb16: ; preds = %bb2 + unreachable +} + +declare i32 @__gxx_personality_v0(...) + +declare i8* @malloc() local_unnamed_addr -- GitLab From fa4e72971e05e3c923e11a31e2025361e3425a8b Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 19 Mar 2021 08:33:27 -0400 Subject: [PATCH 0139/1000] Automate common diagnostic checking for statement attributes Clang currently automates a fair amount of diagnostic checking for declaration attributes based on the declarations in Attr.td. It checks for things like subject appertainment, number of arguments, language options, etc. This patch uses the same machinery to perform diagnostic checking on statement attributes. --- clang/include/clang/Basic/Attr.td | 15 +- clang/include/clang/Sema/ParsedAttr.h | 7 + clang/include/clang/Sema/Sema.h | 7 + clang/lib/Sema/ParsedAttr.cpp | 4 + clang/lib/Sema/SemaAttr.cpp | 48 ++++++ clang/lib/Sema/SemaDeclAttr.cpp | 47 +----- clang/lib/Sema/SemaStmtAttr.cpp | 74 ++++----- clang/lib/Sema/SemaType.cpp | 2 - .../dcl.attr/dcl.attr.fallthrough/p1.cpp | 2 +- clang/test/Parser/stmt-attributes.c | 2 +- clang/test/Sema/c2x-fallthrough.c | 2 +- .../SemaCXX/switch-implicit-fallthrough.cpp | 6 +- clang/utils/TableGen/ClangAttrEmitter.cpp | 157 +++++++++++++----- 13 files changed, 237 insertions(+), 136 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 6b50894512cd..c7b68856aab0 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1183,9 +1183,9 @@ def OpenCLKernel : InheritableAttr { def OpenCLUnrollHint : StmtAttr { let Spellings = [GNU<"opencl_unroll_hint">]; -// let Subjects = SubjectList<[ForStmt, CXXForRangeStmt, WhileStmt, DoStmt], -// ErrorDiag, "'for', 'while', and 'do' statements">; - let Args = [UnsignedArgument<"UnrollHint">]; + let Subjects = SubjectList<[ForStmt, CXXForRangeStmt, WhileStmt, DoStmt], + ErrorDiag, "'for', 'while', and 'do' statements">; + let Args = [UnsignedArgument<"UnrollHint", /*opt*/1>]; let Documentation = [OpenCLUnrollHintDocs]; } @@ -1326,7 +1326,10 @@ def FallThrough : StmtAttr { let Spellings = [CXX11<"", "fallthrough", 201603>, C2x<"", "fallthrough", 201904>, CXX11<"clang", "fallthrough">, GCC<"fallthrough">]; -// let Subjects = [NullStmt]; + // The attribute only applies to a NullStmt, but we have special fix-it + // behavior if applied to a case label. 
+ let Subjects = SubjectList<[NullStmt, SwitchCase], ErrorDiag, + "empty statements">; let Documentation = [FallthroughDocs]; } @@ -1344,7 +1347,8 @@ def NoMerge : DeclOrStmtAttr { let Spellings = [Clang<"nomerge">]; let Documentation = [NoMergeDocs]; let InheritEvenIfAlreadyPresent = 1; - let Subjects = SubjectList<[Function], ErrorDiag, "functions and statements">; + let Subjects = SubjectList<[Function, Stmt], ErrorDiag, + "functions and statements">; let SimpleHandler = 1; } @@ -3467,6 +3471,7 @@ def LoopHint : Attr { }]; let Documentation = [LoopHintDocs, UnrollHintDocs]; + let HasCustomParsing = 1; } def CapturedRecord : InheritableAttr { diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 0d731d9150a8..a3d82fcd84f7 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -39,6 +39,7 @@ class IdentifierInfo; class LangOptions; class ParsedAttr; class Sema; +class Stmt; class TargetInfo; struct ParsedAttrInfo { @@ -80,6 +81,11 @@ struct ParsedAttrInfo { const Decl *D) const { return true; } + /// Check if this attribute appertains to St, and issue a diagnostic if not. + virtual bool diagAppertainsToStmt(Sema &S, const ParsedAttr &Attr, + const Stmt *St) const { + return true; + } /// Check if this attribute is allowed by the language we are compiling, and /// issue a diagnostic if not. virtual bool diagLangOpts(Sema &S, const ParsedAttr &Attr) const { @@ -592,6 +598,7 @@ public: unsigned getMaxArgs() const; bool hasVariadicArg() const; bool diagnoseAppertainsTo(class Sema &S, const Decl *D) const; + bool diagnoseAppertainsTo(class Sema &S, const Stmt *St) const; bool appliesToDecl(const Decl *D, attr::SubjectMatchRule MatchRule) const; void getMatchRules(const LangOptions &LangOpts, SmallVectorImpl> diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index b144587650eb..6fae208f74e7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -4260,6 +4260,13 @@ public: void checkUnusedDeclAttributes(Declarator &D); + /// Handles semantic checking for features that are common to all attributes, + /// such as checking whether a parameter was properly specified, or the + /// correct number of arguments were passed, etc. Returns true if the + /// attribute has been diagnosed. + bool checkCommonAttributeFeatures(const Decl *D, const ParsedAttr &A); + bool checkCommonAttributeFeatures(const Stmt *S, const ParsedAttr &A); + /// Determine if type T is a valid subject for a nonnull and similar /// attributes. 
By default, we look through references (the behavior used by /// nonnull), but if the second parameter is true, then we treat a reference diff --git a/clang/lib/Sema/ParsedAttr.cpp b/clang/lib/Sema/ParsedAttr.cpp index c6a3d7c4342c..1ac7ed1afc4e 100644 --- a/clang/lib/Sema/ParsedAttr.cpp +++ b/clang/lib/Sema/ParsedAttr.cpp @@ -159,6 +159,10 @@ bool ParsedAttr::diagnoseAppertainsTo(Sema &S, const Decl *D) const { return getInfo().diagAppertainsToDecl(S, *this, D); } +bool ParsedAttr::diagnoseAppertainsTo(Sema &S, const Stmt *St) const { + return getInfo().diagAppertainsToStmt(S, *this, St); +} + bool ParsedAttr::appliesToDecl(const Decl *D, attr::SubjectMatchRule MatchRule) const { return checkAttributeMatchRuleAppliesTo(D, MatchRule); diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp index 9df2b7f84b57..2c37ccee1616 100644 --- a/clang/lib/Sema/SemaAttr.cpp +++ b/clang/lib/Sema/SemaAttr.cpp @@ -1188,3 +1188,51 @@ void Sema::PopPragmaVisibility(bool IsNamespaceEnd, SourceLocation EndLoc) { if (Stack->empty()) FreeVisContext(); } + +template +static bool checkCommonAttributeFeatures(Sema& S, const Ty *Node, + const ParsedAttr& A) { + // Several attributes carry different semantics than the parsing requires, so + // those are opted out of the common argument checks. + // + // We also bail on unknown and ignored attributes because those are handled + // as part of the target-specific handling logic. + if (A.getKind() == ParsedAttr::UnknownAttribute) + return false; + // Check whether the attribute requires specific language extensions to be + // enabled. + if (!A.diagnoseLangOpts(S)) + return true; + // Check whether the attribute appertains to the given subject. + if (!A.diagnoseAppertainsTo(S, Node)) + return true; + // Check whether the attribute exists in the target architecture. + if (S.CheckAttrTarget(A)) + return true; + + if (A.hasCustomParsing()) + return false; + + if (A.getMinArgs() == A.getMaxArgs()) { + // If there are no optional arguments, then checking for the argument count + // is trivial. + if (!A.checkExactlyNumArgs(S, A.getMinArgs())) + return true; + } else { + // There are optional arguments, so checking is slightly more involved. + if (A.getMinArgs() && !A.checkAtLeastNumArgs(S, A.getMinArgs())) + return true; + else if (!A.hasVariadicArg() && A.getMaxArgs() && + !A.checkAtMostNumArgs(S, A.getMaxArgs())) + return true; + } + + return false; +} + +bool Sema::checkCommonAttributeFeatures(const Decl *D, const ParsedAttr &A) { + return ::checkCommonAttributeFeatures(*this, D, A); +} +bool Sema::checkCommonAttributeFeatures(const Stmt *S, const ParsedAttr &A) { + return ::checkCommonAttributeFeatures(*this, S, A); +} diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index d713c1ff1016..c4901042c042 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7371,48 +7371,6 @@ static void handleOpenCLNoSVMAttr(Sema &S, Decl *D, const ParsedAttr &AL) { << "2.0"; } -/// Handles semantic checking for features that are common to all attributes, -/// such as checking whether a parameter was properly specified, or the correct -/// number of arguments were passed, etc. -static bool handleCommonAttributeFeatures(Sema &S, Decl *D, - const ParsedAttr &AL) { - // Several attributes carry different semantics than the parsing requires, so - // those are opted out of the common argument checks. 
-  //
-  // We also bail on unknown and ignored attributes because those are handled
-  // as part of the target-specific handling logic.
-  if (AL.getKind() == ParsedAttr::UnknownAttribute)
-    return false;
-  // Check whether the attribute requires specific language extensions to be
-  // enabled.
-  if (!AL.diagnoseLangOpts(S))
-    return true;
-  // Check whether the attribute appertains to the given subject.
-  if (!AL.diagnoseAppertainsTo(S, D))
-    return true;
-  if (AL.hasCustomParsing())
-    return false;
-
-  if (AL.getMinArgs() == AL.getMaxArgs()) {
-    // If there are no optional arguments, then checking for the argument count
-    // is trivial.
-    if (!AL.checkExactlyNumArgs(S, AL.getMinArgs()))
-      return true;
-  } else {
-    // There are optional arguments, so checking is slightly more involved.
-    if (AL.getMinArgs() && !AL.checkAtLeastNumArgs(S, AL.getMinArgs()))
-      return true;
-    else if (!AL.hasVariadicArg() && AL.getMaxArgs() &&
-             !AL.checkAtMostNumArgs(S, AL.getMaxArgs()))
-      return true;
-  }
-
-  if (S.CheckAttrTarget(AL))
-    return true;
-
-  return false;
-}
-
 static void handleOpenCLAccessAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   if (D->isInvalidDecl())
     return;
@@ -7766,7 +7724,7 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
     return;
   }
 
-  if (handleCommonAttributeFeatures(S, D, AL))
+  if (S.checkCommonAttributeFeatures(D, AL))
     return;
 
   switch (AL.getKind()) {
@@ -7778,6 +7736,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D,
       assert(AL.isTypeAttr() && "Non-type attribute not handled");
       break;
     }
+    // N.B., ClangAttrEmitter.cpp emits a diagnostic helper that ensures a
+    // statement attribute is not written on a declaration, but this code is
+    // needed for attributes in Attr.td that do not list any subjects.
     S.Diag(AL.getLoc(), diag::err_stmt_attribute_invalid_on_decl)
         << AL << D->getLocation();
     break;
diff --git a/clang/lib/Sema/SemaStmtAttr.cpp b/clang/lib/Sema/SemaStmtAttr.cpp
index 86a09c42863f..cb90a03aa20e 100644
--- a/clang/lib/Sema/SemaStmtAttr.cpp
+++ b/clang/lib/Sema/SemaStmtAttr.cpp
@@ -26,14 +26,12 @@ using namespace sema;
 static Attr *handleFallThroughAttr(Sema &S, Stmt *St, const ParsedAttr &A,
                                    SourceRange Range) {
   FallThroughAttr Attr(S.Context, A);
-  if (!isa<NullStmt>(St)) {
+  if (isa<SwitchCase>(St)) {
     S.Diag(A.getRange().getBegin(), diag::err_fallthrough_attr_wrong_target)
-        << Attr.getSpelling() << St->getBeginLoc();
-    if (isa<SwitchCase>(St)) {
-      SourceLocation L = S.getLocForEndOfToken(Range.getEnd());
-      S.Diag(L, diag::note_fallthrough_insert_semi_fixit)
-          << FixItHint::CreateInsertion(L, ";");
-    }
+        << A << St->getBeginLoc();
+    SourceLocation L = S.getLocForEndOfToken(Range.getEnd());
+    S.Diag(L, diag::note_fallthrough_insert_semi_fixit)
+        << FixItHint::CreateInsertion(L, ";");
     return nullptr;
   }
   auto *FnScope = S.getCurFunction();
@@ -54,11 +52,6 @@ static Attr *handleFallThroughAttr(Sema &S, Stmt *St, const ParsedAttr &A,
 
 static Attr *handleSuppressAttr(Sema &S, Stmt *St, const ParsedAttr &A,
                                 SourceRange Range) {
-  if (A.getNumArgs() < 1) {
-    S.Diag(A.getLoc(), diag::err_attribute_too_few_arguments) << A << 1;
-    return nullptr;
-  }
-
   std::vector<StringRef> DiagnosticIdentifiers;
   for (unsigned I = 0, E = A.getNumArgs(); I != E; ++I) {
     StringRef RuleName;
@@ -88,10 +81,10 @@ static Attr *handleLoopHintAttr(Sema &S, Stmt *St, const ParsedAttr &A,
                            PragmaNameLoc->Ident->getName())
           .Default("clang loop");
 
-  if (St->getStmtClass() != Stmt::DoStmtClass &&
-      St->getStmtClass() != Stmt::ForStmtClass &&
-      St->getStmtClass() != Stmt::CXXForRangeStmtClass &&
-      St->getStmtClass() != Stmt::WhileStmtClass) {
+  // This could be handled automatically by adding a Subjects definition in
+  // Attr.td, but that would make the diagnostic behavior worse in this case
+  // because the user spells this attribute as a pragma.
+  if (!isa<DoStmt, ForStmt, CXXForRangeStmt, WhileStmt>(St)) {
     std::string Pragma = "#pragma " + std::string(PragmaName);
     S.Diag(St->getBeginLoc(), diag::err_pragma_loop_precedes_nonloop) << Pragma;
     return nullptr;
@@ -205,9 +198,6 @@ public:
 static Attr *handleNoMergeAttr(Sema &S, Stmt *St, const ParsedAttr &A,
                                SourceRange Range) {
   NoMergeAttr NMA(S.Context, A);
-  if (S.CheckAttrNoArgs(A))
-    return nullptr;
-
   CallExprFinder CEF(S, St);
 
   if (!CEF.foundCallExpr()) {
@@ -377,23 +367,8 @@ static Attr *handleOpenCLUnrollHint(Sema &S, Stmt *St, const ParsedAttr &A,
   // opencl_unroll_hint can have 0 arguments (compiler
   // determines unrolling factor) or 1 argument (the unroll factor provided
   // by the user).
-
-  if (!isa<ForStmt, CXXForRangeStmt, WhileStmt, DoStmt>(St)) {
-    S.Diag(A.getLoc(), diag::err_attribute_wrong_decl_type_str)
-        << A << "'for', 'while', and 'do' statements";
-    return nullptr;
-  }
-
-  unsigned NumArgs = A.getNumArgs();
-
-  if (NumArgs > 1) {
-    S.Diag(A.getLoc(), diag::err_attribute_too_many_arguments) << A << 1;
-    return nullptr;
-  }
-
   unsigned UnrollFactor = 0;
-
-  if (NumArgs == 1) {
+  if (A.getNumArgs() == 1) {
     Expr *E = A.getArgAsExpr(0);
     Optional<llvm::APSInt> ArgVal;
 
@@ -404,28 +379,42 @@ static Attr *handleOpenCLUnrollHint(Sema &S, Stmt *St, const ParsedAttr &A,
     }
 
     int Val = ArgVal->getSExtValue();
-
     if (Val <= 0) {
       S.Diag(A.getRange().getBegin(),
              diag::err_attribute_requires_positive_integer)
          << A << /* positive */ 0;
       return nullptr;
     }
-    UnrollFactor = Val;
+    UnrollFactor = static_cast<unsigned>(Val);
   }
 
-  return OpenCLUnrollHintAttr::CreateImplicit(S.Context, UnrollFactor);
+  return ::new (S.Context) OpenCLUnrollHintAttr(S.Context, A, UnrollFactor);
 }
 
 static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
                                   SourceRange Range) {
-  switch (A.getKind()) {
-  case ParsedAttr::UnknownAttribute:
+  if (A.isInvalid() || A.getKind() == ParsedAttr::IgnoredAttribute)
+    return nullptr;
+
+  // Unknown attributes are automatically warned on. Target-specific attributes
+  // which do not apply to the current target architecture are treated as
+  // though they were unknown attributes.
+  const TargetInfo *Aux = S.Context.getAuxTargetInfo();
+  if (A.getKind() == ParsedAttr::UnknownAttribute ||
+      !(A.existsInTarget(S.Context.getTargetInfo()) ||
+        (S.Context.getLangOpts().SYCLIsDevice && Aux &&
+         A.existsInTarget(*Aux)))) {
     S.Diag(A.getLoc(), A.isDeclspecAttribute()
                            ? (unsigned)diag::warn_unhandled_ms_attribute_ignored
                            : (unsigned)diag::warn_unknown_attribute_ignored)
        << A << A.getRange();
    return nullptr;
+  }
+
+  if (S.checkCommonAttributeFeatures(St, A))
+    return nullptr;
+
+  switch (A.getKind()) {
   case ParsedAttr::AT_FallThrough:
     return handleFallThroughAttr(S, St, A, Range);
   case ParsedAttr::AT_LoopHint:
@@ -441,8 +430,9 @@ static Attr *ProcessStmtAttribute(Sema &S, Stmt *St, const ParsedAttr &A,
   case ParsedAttr::AT_Unlikely:
     return handleUnlikely(S, St, A, Range);
   default:
-    // if we're here, then we parsed a known attribute, but didn't recognize
-    // it as a statement attribute => it is declaration attribute
+    // N.B., ClangAttrEmitter.cpp emits a diagnostic helper that ensures a
+    // declaration attribute is not written on a statement, but this code is
+    // needed for attributes in Attr.td that do not list any subjects.
     S.Diag(A.getRange().getBegin(), diag::err_decl_attribute_invalid_on_stmt)
         << A << St->getBeginLoc();
     return nullptr;
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
index ffd431608b82..97971b300981 100644
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -7968,8 +7968,6 @@ static void HandleLifetimeBoundAttr(TypeProcessingState &State,
     CurType = State.getAttributedType(
         createSimpleAttr<LifetimeBoundAttr>(State.getSema().Context, Attr),
         CurType, CurType);
-  } else {
-    Attr.diagnoseAppertainsTo(State.getSema(), nullptr);
   }
 }
 
diff --git a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp
index f267d9067bcc..22815bbde9db 100644
--- a/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp
+++ b/clang/test/CXX/dcl.dcl/dcl.attr/dcl.attr.fallthrough/p1.cpp
@@ -53,7 +53,7 @@ class [[fallthrough]] C {}; // expected-error {{'fallthrough' attribute cannot b
 [[fallthrough]] // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
 void g() {
   [[fallthrough]] int n; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
-  [[fallthrough]] ++n; // expected-error-re {{{{^}}fallthrough attribute is only allowed on empty statements}}
+  [[fallthrough]] ++n; // expected-error {{'fallthrough' attribute only applies to empty statements}}
 
   switch (n) {
     // FIXME: This should be an error.
diff --git a/clang/test/Parser/stmt-attributes.c b/clang/test/Parser/stmt-attributes.c
index d142ce1b5b95..86adc56f40ca 100644
--- a/clang/test/Parser/stmt-attributes.c
+++ b/clang/test/Parser/stmt-attributes.c
@@ -40,7 +40,7 @@ void foo(int i) {
 
   __attribute__((unused)) switch (i) { // expected-error {{'unused' attribute cannot be applied to a statement}}
   __attribute__((uuid)) case 0: // expected-warning {{unknown attribute 'uuid' ignored}}
-  __attribute__((visibility)) default: // expected-error {{'visibility' attribute cannot be applied to a statement}}
+  __attribute__((visibility(""))) default: // expected-error {{'visibility' attribute cannot be applied to a statement}}
     __attribute__((carries_dependency)) break; // expected-error {{'carries_dependency' attribute cannot be applied to a statement}}
   }
 
diff --git a/clang/test/Sema/c2x-fallthrough.c b/clang/test/Sema/c2x-fallthrough.c
index 2fd69c4da0f2..e5508e0a10f1 100644
--- a/clang/test/Sema/c2x-fallthrough.c
+++ b/clang/test/Sema/c2x-fallthrough.c
@@ -57,7 +57,7 @@ struct [[fallthrough]] S { // expected-error {{'fallthrough' attribute cannot be
 [[fallthrough]] // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
 void g(void) {
   [[fallthrough]] int n; // expected-error {{'fallthrough' attribute cannot be applied to a declaration}}
-  [[fallthrough]] ++n; // expected-error-re {{{{^}}fallthrough attribute is only allowed on empty statements}}
+  [[fallthrough]] ++n; // expected-error {{'fallthrough' attribute only applies to empty statements}}
 
   switch (n) {
     // FIXME: This should be an error.
diff --git a/clang/test/SemaCXX/switch-implicit-fallthrough.cpp b/clang/test/SemaCXX/switch-implicit-fallthrough.cpp
index a67f6bef1f49..e6ae0d55b588 100644
--- a/clang/test/SemaCXX/switch-implicit-fallthrough.cpp
+++ b/clang/test/SemaCXX/switch-implicit-fallthrough.cpp
@@ -299,16 +299,16 @@ int fallthrough_placement_error(int n) {
 int fallthrough_targets(int n) {
   [[clang::fallthrough]]; // expected-error{{fallthrough annotation is outside switch statement}}
 
-  [[clang::fallthrough]]  // expected-error{{fallthrough attribute is only allowed on empty statements}}
+  [[clang::fallthrough]]  // expected-error{{'fallthrough' attribute only applies to empty statements}}
   switch (n) {
   case 121:
     n += 400;
    [[clang::fallthrough]]; // no warning here, correct target
   case 123:
-    [[clang::fallthrough]]  // expected-error{{fallthrough attribute is only allowed on empty statements}}
+    [[clang::fallthrough]]  // expected-error{{'fallthrough' attribute only applies to empty statements}}
     n += 800;
     break;
-    [[clang::fallthrough]]  // expected-error{{fallthrough attribute is only allowed on empty statements}} expected-note{{did you forget ';'?}}
+    [[clang::fallthrough]]  // expected-error{{'fallthrough' attribute only applies to empty statements}} expected-note{{did you forget ';'?}}
   case 125:
     n += 1600;
   }
diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp
index aaef538e9bf9..e74df36899d4 100644
--- a/clang/utils/TableGen/ClangAttrEmitter.cpp
+++ b/clang/utils/TableGen/ClangAttrEmitter.cpp
@@ -1828,6 +1828,22 @@ struct PragmaClangAttributeSupport {
 
 } // end anonymous namespace
 
+static bool isSupportedPragmaClangAttributeSubject(const Record &Subject) {
+  // FIXME: #pragma clang attribute does not currently support statement
+  // attributes, so test whether the subject is one that appertains to a
+  // declaration node. However, it may be reasonable for support for statement
+  // attributes to be added.
+  if (Subject.isSubClassOf("DeclNode") || Subject.isSubClassOf("DeclBase") ||
+      Subject.getName() == "DeclBase")
+    return true;
+
+  if (Subject.isSubClassOf("SubsetSubject"))
+    return isSupportedPragmaClangAttributeSubject(
+        *Subject.getValueAsDef("Base"));
+
+  return false;
+}
+
 static bool doesDeclDeriveFrom(const Record *D, const Record *Base) {
   const Record *CurrentBase = D->getValueAsOptionalDef(BaseFieldName);
   if (!CurrentBase)
@@ -1949,13 +1965,15 @@ bool PragmaClangAttributeSupport::isAttributedSupported(
     return false;
   const Record *SubjectObj = Attribute.getValueAsDef("Subjects");
   std::vector<Record *> Subjects = SubjectObj->getValueAsListOfDefs("Subjects");
-  if (Subjects.empty())
-    return false;
+  bool HasAtLeastOneValidSubject = false;
   for (const auto *Subject : Subjects) {
+    if (!isSupportedPragmaClangAttributeSubject(*Subject))
+      continue;
     if (SubjectsToRules.find(Subject) == SubjectsToRules.end())
       return false;
+    HasAtLeastOneValidSubject = true;
   }
-  return true;
+  return HasAtLeastOneValidSubject;
 }
 
 static std::string GenerateTestExpression(ArrayRef<Record *> LangOpts) {
@@ -2001,6 +2019,8 @@ PragmaClangAttributeSupport::generateStrictConformsTo(const Record &Attr,
   const Record *SubjectObj = Attr.getValueAsDef("Subjects");
   std::vector<Record *> Subjects = SubjectObj->getValueAsListOfDefs("Subjects");
   for (const auto *Subject : Subjects) {
+    if (!isSupportedPragmaClangAttributeSubject(*Subject))
+      continue;
     auto It = SubjectsToRules.find(Subject);
     assert(It != SubjectsToRules.end() &&
            "This attribute is unsupported by #pragma clang attribute");
@@ -3503,7 +3523,7 @@ static void GenerateAppertainsTo(const Record &Attr, raw_ostream &OS) {
     return;
 
   const Record *SubjectObj = Attr.getValueAsDef("Subjects");
-  std::vector<Record*> Subjects = SubjectObj->getValueAsListOfDefs("Subjects");
+  std::vector<Record *> Subjects = SubjectObj->getValueAsListOfDefs("Subjects");
 
   // If the list of subjects is empty, it is assumed that the attribute
  // appertains to everything.
 
   bool Warn = SubjectObj->getValueAsDef("Diag")->getValueAsBit("Warn");
 
-  // Otherwise, generate an appertainsTo check specific to this attribute which
-  // checks all of the given subjects against the Decl passed in.
-  //
-  // If D is null, that means the attribute was not applied to a declaration
-  // at all (for instance because it was applied to a type), or that the caller
-  // has determined that the check should fail (perhaps prior to the creation
-  // of the declaration).
-  OS << "bool diagAppertainsToDecl(Sema &S, ";
-  OS << "const ParsedAttr &Attr, const Decl *D) const override {\n";
-  OS << "  if (";
-  for (auto I = Subjects.begin(), E = Subjects.end(); I != E; ++I) {
-    // If the subject has custom code associated with it, use the generated
-    // function for it. The function cannot be inlined into this check (yet)
-    // because it requires the subject to be of a specific type, and were that
-    // information inlined here, it would not support an attribute with multiple
-    // custom subjects.
-    if ((*I)->isSubClassOf("SubsetSubject")) {
-      OS << "!" << functionNameForCustomAppertainsTo(**I) << "(D)";
-    } else {
-      OS << "!isa<" << GetSubjectWithSuffix(*I) << ">(D)";
+  // Split the subjects into declaration subjects and statement subjects.
+  // FIXME: subset subjects are added to the declaration list until there are
+  //        enough statement attributes with custom subject needs to warrant
+  //        the implementation effort.
+  std::vector<const Record *> DeclSubjects, StmtSubjects;
+  llvm::copy_if(
+      Subjects, std::back_inserter(DeclSubjects), [](const Record *R) {
+        return R->isSubClassOf("SubsetSubject") || !R->isSubClassOf("StmtNode");
+      });
+  llvm::copy_if(Subjects, std::back_inserter(StmtSubjects),
+                [](const Record *R) { return R->isSubClassOf("StmtNode"); });
+
+  // We should have sorted all of the subjects into two lists.
+  // FIXME: this assertion will be wrong if we ever add type attribute subjects.
+  assert(DeclSubjects.size() + StmtSubjects.size() == Subjects.size());
+
+  if (DeclSubjects.empty()) {
+    // If there are no decl subjects but there are stmt subjects, diagnose
+    // trying to apply a statement attribute to a declaration.
+    if (!StmtSubjects.empty()) {
+      OS << "bool diagAppertainsToDecl(Sema &S, const ParsedAttr &AL, ";
+      OS << "const Decl *D) const override {\n";
+      OS << "  S.Diag(AL.getLoc(), diag::err_stmt_attribute_invalid_on_decl)\n";
+      OS << "      << AL << D->getLocation();\n";
+      OS << "  return false;\n";
+      OS << "}\n\n";
+    }
+  } else {
+    // Otherwise, generate an appertainsTo check specific to this attribute
+    // which checks all of the given subjects against the Decl passed in.
+    OS << "bool diagAppertainsToDecl(Sema &S, ";
+    OS << "const ParsedAttr &Attr, const Decl *D) const override {\n";
+    OS << "  if (";
+    for (auto I = DeclSubjects.begin(), E = DeclSubjects.end(); I != E; ++I) {
+      // If the subject has custom code associated with it, use the generated
+      // function for it. The function cannot be inlined into this check (yet)
+      // because it requires the subject to be of a specific type, and were that
+      // information inlined here, it would not support an attribute with
+      // multiple custom subjects.
+      if ((*I)->isSubClassOf("SubsetSubject"))
+        OS << "!" << functionNameForCustomAppertainsTo(**I) << "(D)";
+      else
+        OS << "!isa<" << GetSubjectWithSuffix(*I) << ">(D)";
 
-      if (I + 1 != E)
-        OS << " && ";
+      if (I + 1 != E)
+        OS << " && ";
+    }
+    OS << ") {\n";
+    OS << "    S.Diag(Attr.getLoc(), diag::";
+    OS << (Warn ? "warn_attribute_wrong_decl_type_str"
+                : "err_attribute_wrong_decl_type_str");
+    OS << ")\n";
+    OS << "      << Attr << ";
+    OS << CalculateDiagnostic(*SubjectObj) << ";\n";
+    OS << "    return false;\n";
+    OS << "  }\n";
+    OS << "  return true;\n";
+    OS << "}\n\n";
+  }
+
+  if (StmtSubjects.empty()) {
+    // If there are no stmt subjects but there are decl subjects, diagnose
+    // trying to apply a declaration attribute to a statement.
+    if (!DeclSubjects.empty()) {
+      OS << "bool diagAppertainsToStmt(Sema &S, const ParsedAttr &AL, ";
+      OS << "const Stmt *St) const override {\n";
+      OS << "  S.Diag(AL.getLoc(), diag::err_decl_attribute_invalid_on_stmt)\n";
+      OS << "      << AL << St->getBeginLoc();\n";
+      OS << "  return false;\n";
+      OS << "}\n\n";
+    }
+  } else {
+    // Now, do the same for statements.
+    OS << "bool diagAppertainsToStmt(Sema &S, ";
+    OS << "const ParsedAttr &Attr, const Stmt *St) const override {\n";
+    OS << "  if (";
+    for (auto I = StmtSubjects.begin(), E = StmtSubjects.end(); I != E; ++I) {
+      OS << "!isa<" << (*I)->getName() << ">(St)";
+      if (I + 1 != E)
+        OS << " && ";
+    }
+    OS << ") {\n";
+    OS << "    S.Diag(Attr.getLoc(), diag::";
+    OS << (Warn ? "warn_attribute_wrong_decl_type_str"
+                : "err_attribute_wrong_decl_type_str");
+    OS << ")\n";
+    OS << "      << Attr << ";
+    OS << CalculateDiagnostic(*SubjectObj) << ";\n";
+    OS << "    return false;\n";
+    OS << "  }\n";
+    OS << "  return true;\n";
+    OS << "}\n\n";
   }
-  OS << ") {\n";
-  OS << "    S.Diag(Attr.getLoc(), diag::";
-  OS << (Warn ? "warn_attribute_wrong_decl_type_str" :
-               "err_attribute_wrong_decl_type_str");
-  OS << ")\n";
-  OS << "    << Attr << ";
-  OS << CalculateDiagnostic(*SubjectObj) << ";\n";
-  OS << "    return false;\n";
-  OS << "  }\n";
-  OS << "  return true;\n";
-  OS << "}\n\n";
 }
 
 static void
@@ -4214,9 +4291,13 @@ void EmitTestPragmaAttributeSupportedAttributes(RecordKeeper &Records,
     std::vector<Record *> Subjects = SubjectObj->getValueAsListOfDefs("Subjects");
     OS << " (";
+    bool PrintComma = false;
     for (const auto &Subject : llvm::enumerate(Subjects)) {
-      if (Subject.index())
+      if (!isSupportedPragmaClangAttributeSubject(*Subject.value()))
+        continue;
+      if (PrintComma)
         OS << ", ";
+      PrintComma = true;
       PragmaClangAttributeSupport::RuleOrAggregateRuleSet &RuleSet =
           Support.SubjectsToRules.find(Subject.value())->getSecond();
       if (RuleSet.isRule()) {
-- 
GitLab


From a5f9cda17333530de3d78282d10f53abfaa00906 Mon Sep 17 00:00:00 2001
From: Christian Sigg
Date: Fri, 19 Mar 2021 10:11:51 +0100
Subject: [PATCH 0140/1000] [mlir] Rename gpu-to-llvm pass implementation file

Also remove populate patterns function and binary annotation name option.

Reviewed By: ftynse

Differential Revision: https://reviews.llvm.org/D98930
---
 .../mlir/Conversion/GPUCommon/GPUCommonPass.h | 15 ++-----
 mlir/lib/Conversion/GPUCommon/CMakeLists.txt  |  2 +-
 ...ntimeCalls.cpp => GPUToLLVMConversion.cpp} | 44 ++++++++-----------
 3 files changed, 22 insertions(+), 39 deletions(-)
 rename mlir/lib/Conversion/GPUCommon/{ConvertLaunchFuncToRuntimeCalls.cpp => GPUToLLVMConversion.cpp} (98%)

diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 173d8feced35..878861e406e4 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -43,23 +43,14 @@ using BlobGenerator =
     std::function<OwnedBlob(const std::string &, Location, StringRef)>;
 using LoweringCallback = std::function<std::unique_ptr<llvm::Module>(
     Operation *, llvm::LLVMContext &, StringRef)>;
 
-/// Creates a pass to convert a gpu.launch_func operation into a sequence of
-/// GPU runtime calls.
+/// Creates a pass to convert GPU operations into a sequence of GPU runtime
+/// calls.
 ///
 /// This pass does not generate code to call GPU runtime APIs directly but
 /// instead uses a small wrapper library that exports a stable and conveniently
 /// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
-///
-/// A non-empty gpuBinaryAnnotation overrides the pass' command line option.
-std::unique_ptr<OperationPass<ModuleOp>>
-createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation = {});
+std::unique_ptr<OperationPass<ModuleOp>> createGpuToLLVMConversionPass();
 
-/// Collect a set of patterns to convert from the GPU dialect to LLVM.
-///
-/// A non-empty gpuBinaryAnnotation overrides the pass' command line option.
-void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
-                                         OwningRewritePatternList &patterns,
-                                         StringRef gpuBinaryAnnotation = {});
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
index 04ff2a994091..65ad9de0b20c 100644
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -15,7 +15,7 @@ if (MLIR_ROCM_CONVERSIONS_ENABLED)
 endif()
 
 add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
-  ConvertLaunchFuncToRuntimeCalls.cpp
+  GPUToLLVMConversion.cpp
   GPUOpsLowering.cpp
 
   DEPENDS
diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
similarity index 98%
rename from mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
rename to mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 0e3bf166c47e..d490c5247a9f 100644
--- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -41,10 +41,7 @@ namespace {
 
 class GpuToLLVMConversionPass
     : public GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
 public:
-  GpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
-    if (!gpuBinaryAnnotation.empty())
-      this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
-  }
+  GpuToLLVMConversionPass() = default;
 
   GpuToLLVMConversionPass(const GpuToLLVMConversionPass &other)
       : GpuToLLVMConversionPassBase(other) {}
@@ -318,7 +315,21 @@ void GpuToLLVMConversionPass::runOnOperation() {
   populateStdToLLVMConversionPatterns(converter, patterns);
   populateAsyncStructuralTypeConversionsAndLegality(&getContext(), converter,
                                                     patterns, target);
-  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);
+
+  converter.addConversion(
+      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
+        return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
+      });
+  patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,
+                  ConvertDeallocOpToGpuRuntimeCallPattern,
+                  ConvertHostRegisterOpToGpuRuntimeCallPattern,
+                  ConvertMemcpyOpToGpuRuntimeCallPattern,
+                  ConvertWaitAsyncOpToGpuRuntimeCallPattern,
+                  ConvertWaitOpToGpuRuntimeCallPattern>(converter);
+  patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
+      converter, gpuBinaryAnnotation);
+  patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
 
   if (failed(
           applyPartialConversion(getOperation(), target, std::move(patterns))))
@@ -784,25 +795,6 @@ LogicalResult ConvertMemcpyOpToGpuRuntimeCallPattern::matchAndRewrite(
 }
 
 std::unique_ptr<OperationPass<ModuleOp>>
-mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) {
-  return std::make_unique<GpuToLLVMConversionPass>(gpuBinaryAnnotation);
-}
-
-void mlir::populateGpuToLLVMConversionPatterns(
-    LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
-    StringRef gpuBinaryAnnotation) {
-  converter.addConversion(
-      [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
-        return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
-      });
-  patterns.insert<ConvertAllocOpToGpuRuntimeCallPattern,
-                  ConvertDeallocOpToGpuRuntimeCallPattern,
-                  ConvertHostRegisterOpToGpuRuntimeCallPattern,
-                  ConvertMemcpyOpToGpuRuntimeCallPattern,
-                  ConvertWaitAsyncOpToGpuRuntimeCallPattern,
-                  ConvertWaitOpToGpuRuntimeCallPattern>(converter);
-  patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
-      converter, gpuBinaryAnnotation);
-  patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
+mlir::createGpuToLLVMConversionPass() {
+  return std::make_unique<GpuToLLVMConversionPass>();
 }
-- 
GitLab


From 04790d9cfba35073d56047544502c387c5657bb1 Mon Sep 17 00:00:00 2001
From: Jeroen Dobbelaere
Date: Fri, 19 Mar 2021 14:34:25 +0100
Subject: [PATCH 0141/1000] Support intrinsic overloading on unnamed types

This patch adds support for intrinsic overloading on unnamed types.

This fixes PR38117 and PR48340 and will also be needed for the
Full Restrict Patches (D68484).

The main problem is that the intrinsic overloading name mangling is
using 's_s' for unnamed types.
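For example, consider two distinct unnamed types %0 and %1 (the
declarations below are adapted from the tests added by this patch);
both prototypes used to mangle to the same 'llvm.ssa.copy.p0s_s':

  declare %0* @llvm.ssa.copy.p0s_s(%0* returned)
  declare %1* @llvm.ssa.copy.p0s_s(%1* returned)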
This can result in identical intrinsic mangled names for different
function prototypes.

This patch changes this by adding a '.XXXXX' to the intrinsic mangled name
when at least one of the types is based on an unnamed type, ensuring that we
get a unique name.

Implementation details:
- The mapping is created on demand and kept in Module.
- It also checks for existing clashes and recycles potentially existing
  prototypes and declarations.
- Because of extra data in Module, Intrinsic::getName needs an extra Module*
  argument and, for speed, an optional FunctionType* argument.
- I still kept the original two-argument 'Intrinsic::getName' around which
  keeps the original behavior (providing the base name).
-- Main reason is that I did not want to change the LLVMIntrinsicGetName
   version, as I don't know how acceptable such a change is.
-- The current situation already has a limitation. So that should not get
   worse with this patch.
- Intrinsic::getDeclaration and the verifier are now using the new version.

Other notes:
- As far as I see, this should not suffer from stability issues. The count
  is only added for prototypes depending on at least one anonymous struct.
- The initial count starts from 0 for each intrinsic mangled name.
- In case of name clashes, existing prototypes are remembered and reused
  when that makes sense.

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D91250
---
 llvm/docs/LangRef.rst                         |  14 ++-
 llvm/include/llvm/IR/Intrinsics.h             |  19 +++-
 llvm/include/llvm/IR/Module.h                 |  13 +++
 llvm/lib/IR/Function.cpp                      |  54 +++++++---
 llvm/lib/IR/Module.cpp                        |  50 +++++++++
 llvm/lib/IR/Verifier.cpp                      |   3 +-
 llvm/lib/Linker/IRMover.cpp                   |  19 +++-
 .../Bitcode/intrinsics-with-unnamed-types.ll  |  31 ++++++
 .../Linker/intrinsics-with-unnamed-types.ll   | 101 ++++++++++++++++++
 .../Transforms/LoopVectorize/X86/pr48340.ll   |  54 ++++++++++
 10 files changed, 332 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/Bitcode/intrinsics-with-unnamed-types.ll
 create mode 100644 llvm/test/Linker/intrinsics-with-unnamed-types.ll
 create mode 100644 llvm/test/Transforms/LoopVectorize/X86/pr48340.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index d53795ef5607..54fb8945324b 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3392,9 +3392,10 @@ Opaque Structure Types
 
 :Overview:
 
-Opaque structure types are used to represent named structure types that
+Opaque structure types are used to represent structure types that
 do not have a body specified. This corresponds (for example) to the C
-notion of a forward declared structure.
+notion of a forward declared structure. They can be named (``%X``) or
+unnamed (``%52``).
 
 :Syntax:
 
@@ -11507,6 +11508,15 @@ overloaded, and only one type suffix is required. Because the argument's
 type is matched against the return type, it does not require its own
 name suffix.
 
+:ref:`Unnamed types <t_opaque>` are encoded as ``s_s``. Overloaded intrinsics
+that depend on an unnamed type in one of its overloaded argument types get an
+additional ``.<number>`` suffix. This allows differentiating intrinsics with
+different unnamed types as arguments. (For example:
+``llvm.ssa.copy.p0s_s.2(%42*)``) The number is tracked in the LLVM module and
+it ensures unique names in the module. While linking together two modules, it
+is still possible to get a name clash. In that case one of the names will be
+changed by getting a new number.
+
 For target developers who are defining intrinsics for back-end code
 generation, any intrinsic overloads based solely the distinction between
 integer or floating point types should not be relied upon for correct
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index f9b6c098a3f2..ae84ee8f354a 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -56,11 +56,20 @@ namespace Intrinsic {
   StringRef getName(ID id);
 
   /// Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
-  /// Note, this version of getName supports overloads, but is less efficient
-  /// than the StringRef version of this function.  If no overloads are
-  /// requried, it is safe to use this version, but better to use the StringRef
-  /// version.
-  std::string getName(ID id, ArrayRef<Type *> Tys);
+  /// Note, this version of getName supports overloads, but not unnamed types.
+  /// It is less efficient than the StringRef version of this function.  If no
+  /// overloads are required, it is safe to use this version, but better to use
+  /// the StringRef version.
+  std::string getName(ID Id, ArrayRef<Type *> Tys);
+
+  /// Return the LLVM name for an intrinsic, such as "llvm.ssa.copy.p0s_s.1".
+  /// Note, this version of getName supports overloads and unnamed types, but is
+  /// less efficient than the StringRef version of this function.  If no
+  /// overloads are required, it is safe to use this version, but better to use
+  /// the StringRef version. A function type FT can be provided to avoid
+  /// computing it. It is used (or computed) if one of the types is based on an
+  /// unnamed type.
+  std::string getName(ID Id, ArrayRef<Type *> Tys, Module *M, FunctionType *FT);
 
   /// Return the function type for an intrinsic.
   FunctionType *getType(LLVMContext &Context, ID id,
diff --git a/llvm/include/llvm/IR/Module.h b/llvm/include/llvm/IR/Module.h
index a27f44ed9d31..6abe67575bbf 100644
--- a/llvm/include/llvm/IR/Module.h
+++ b/llvm/include/llvm/IR/Module.h
@@ -197,6 +197,14 @@ private:
                                   ///< Format: (arch)(sub)-(vendor)-(sys0-(abi)
   NamedMDSymTabType NamedMDSymTab;  ///< NamedMDNode names.
   DataLayout DL;                  ///< DataLayout associated with the module
+  StringMap<unsigned>
+      CurrentIntrinsicIds; ///< Keep track of the current unique id count for
+                           ///< the specified intrinsic basename.
+  DenseMap<std::pair<Intrinsic::ID, const FunctionType *>, unsigned>
+      UniquedIntrinsicNames; ///< Keep track of uniqued names of intrinsics
+                             ///< based on unnamed types. The combination of
+                             ///< ID and FunctionType maps to the extension that
+                             ///< is used to make the intrinsic name unique.
 
   friend class Constant;
 
@@ -331,6 +339,11 @@ public:
 
   std::vector<StructType *> getIdentifiedStructTypes() const;
 
+  /// Return a unique name for an intrinsic whose mangling is based on an
+  /// unnamed type. The Proto represents the function prototype.
+  std::string getUniqueIntrinsicName(StringRef BaseName, Intrinsic::ID Id,
+                                     const FunctionType *Proto);
+
   /// @}
   /// @name Function Accessors
   /// @{
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 46aec7294572..ab8d425ef44c 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -726,30 +726,34 @@ void Function::recalculateIntrinsicID() {
 /// which can't be confused with it's prefix.  This ensures we don't have
 /// collisions between two unrelated function types. Otherwise, you might
 /// parse ffXX as f(fXX) or f(fX)X.  (X is a placeholder for any other type.)
-///
-static std::string getMangledTypeStr(Type* Ty) {
+/// The HasUnnamedType boolean is set if an unnamed type was encountered,
+/// indicating that extra care must be taken to ensure a unique name.
+static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) {
   std::string Result;
   if (PointerType* PTyp = dyn_cast<PointerType>(Ty)) {
     Result += "p" + utostr(PTyp->getAddressSpace()) +
-      getMangledTypeStr(PTyp->getElementType());
+              getMangledTypeStr(PTyp->getElementType(), HasUnnamedType);
   } else if (ArrayType* ATyp = dyn_cast<ArrayType>(Ty)) {
     Result += "a" + utostr(ATyp->getNumElements()) +
-      getMangledTypeStr(ATyp->getElementType());
+              getMangledTypeStr(ATyp->getElementType(), HasUnnamedType);
   } else if (StructType *STyp = dyn_cast<StructType>(Ty)) {
     if (!STyp->isLiteral()) {
       Result += "s_";
-      Result += STyp->getName();
+      if (STyp->hasName())
+        Result += STyp->getName();
+      else
+        HasUnnamedType = true;
     } else {
       Result += "sl_";
       for (auto Elem : STyp->elements())
-        Result += getMangledTypeStr(Elem);
+        Result += getMangledTypeStr(Elem, HasUnnamedType);
     }
     // Ensure nested structs are distinguishable.
     Result += "s";
   } else if (FunctionType *FT = dyn_cast<FunctionType>(Ty)) {
-    Result += "f_" + getMangledTypeStr(FT->getReturnType());
+    Result += "f_" + getMangledTypeStr(FT->getReturnType(), HasUnnamedType);
     for (size_t i = 0; i < FT->getNumParams(); i++)
-      Result += getMangledTypeStr(FT->getParamType(i));
+      Result += getMangledTypeStr(FT->getParamType(i), HasUnnamedType);
     if (FT->isVarArg())
       Result += "vararg";
     // Ensure nested function types are distinguishable.
@@ -759,7 +763,7 @@ static std::string getMangledTypeStr(Type* Ty) {
     if (EC.isScalable())
       Result += "nx";
     Result += "v" + utostr(EC.getKnownMinValue()) +
-      getMangledTypeStr(VTy->getElementType());
+              getMangledTypeStr(VTy->getElementType(), HasUnnamedType);
   } else if (Ty) {
     switch (Ty->getTypeID()) {
     default: llvm_unreachable("Unhandled type");
@@ -789,17 +793,32 @@ StringRef Intrinsic::getName(ID id) {
   return IntrinsicNameTable[id];
 }
 
-std::string Intrinsic::getName(ID id, ArrayRef<Type *> Tys) {
-  assert(id < num_intrinsics && "Invalid intrinsic ID!");
-  assert((Tys.empty() || Intrinsic::isOverloaded(id)) &&
+std::string Intrinsic::getName(ID Id, ArrayRef<Type *> Tys, Module *M,
+                               FunctionType *FT) {
+  assert(Id < num_intrinsics && "Invalid intrinsic ID!");
+  assert((Tys.empty() || Intrinsic::isOverloaded(Id)) &&
          "This version of getName is for overloaded intrinsics only");
-  std::string Result(IntrinsicNameTable[id]);
+  bool HasUnnamedType = false;
+  std::string Result(IntrinsicNameTable[Id]);
   for (Type *Ty : Tys) {
-    Result += "." + getMangledTypeStr(Ty);
+    Result += "." + getMangledTypeStr(Ty, HasUnnamedType);
+  }
+  assert((M || !HasUnnamedType) && "unnamed types need a module");
+  if (M && HasUnnamedType) {
+    if (!FT)
+      FT = getType(M->getContext(), Id, Tys);
+    else
+      assert((FT == getType(M->getContext(), Id, Tys)) &&
+             "Provided FunctionType must match arguments");
+    return M->getUniqueIntrinsicName(Result, Id, FT);
   }
   return Result;
 }
 
+std::string Intrinsic::getName(ID Id, ArrayRef<Type *> Tys) {
+  return getName(Id, Tys, nullptr, nullptr);
+}
+
 /// IIT_Info - These are enumerators that describe the entries returned by the
 /// getIntrinsicInfoTableEntries function.
 ///
@@ -1259,8 +1278,10 @@ bool Intrinsic::isLeaf(ID id) {
 Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef<Type *> Tys) {
   // There can never be multiple globals with the same name of different types,
   // because intrinsics must be a specific type.
+  auto *FT = getType(M->getContext(), id, Tys);
   return cast<Function>(
-      M->getOrInsertFunction(Tys.empty() ? getName(id) : getName(id, Tys),
+      M->getOrInsertFunction(Tys.empty() ? getName(id)
+                                         : getName(id, Tys, M, FT),
                              getType(M->getContext(), id, Tys))
           .getCallee());
 }
@@ -1573,7 +1594,8 @@ Optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
 
   Intrinsic::ID ID = F->getIntrinsicID();
   StringRef Name = F->getName();
-  if (Name == Intrinsic::getName(ID, ArgTys))
+  if (Name ==
+      Intrinsic::getName(ID, ArgTys, F->getParent(), F->getFunctionType()))
     return None;
 
   auto NewDecl = Intrinsic::getDeclaration(F->getParent(), ID, ArgTys);
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 60056f142d8a..b9c3663b8fa3 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -473,6 +473,56 @@ std::vector<StructType *> Module::getIdentifiedStructTypes() const {
   return Ret;
 }
 
+std::string Module::getUniqueIntrinsicName(StringRef BaseName, Intrinsic::ID Id,
+                                           const FunctionType *Proto) {
+  auto Encode = [&BaseName](unsigned Suffix) {
+    return (Twine(BaseName) + "." + Twine(Suffix)).str();
+  };
+
+  {
+    // fast path - the prototype is already known
+    auto UinItInserted = UniquedIntrinsicNames.insert({{Id, Proto}, 0});
+    if (!UinItInserted.second)
+      return Encode(UinItInserted.first->second);
+  }
+
+  // Not known yet. A new entry was created with index 0. Check if there already
+  // exists a matching declaration, or select a new entry.
+
+  // Start looking for names with the current known maximum count (or 0).
+  auto NiidItInserted = CurrentIntrinsicIds.insert({BaseName, 0});
+  unsigned Count = NiidItInserted.first->second;
+
+  // This might be slow if a whole population of intrinsics already existed, but
+  // we cache the values for later usage.
+  std::string NewName;
+  while (true) {
+    NewName = Encode(Count);
+    GlobalValue *F = getNamedValue(NewName);
+    if (!F) {
+      // Reserve this entry for the new proto
+      UniquedIntrinsicNames[{Id, Proto}] = Count;
+      break;
+    }
+
+    // A declaration with this name already exists. Remember it.
+    FunctionType *FT = dyn_cast<FunctionType>(F->getType()->getElementType());
+    auto UinItInserted = UniquedIntrinsicNames.insert({{Id, FT}, Count});
+    if (FT == Proto) {
+      // It was a declaration for our prototype. This entry was allocated in the
+      // beginning. Update the count to match the existing declaration.
+      UinItInserted.first->second = Count;
+      break;
+    }
+
+    ++Count;
+  }
+
+  NiidItInserted.first->second = Count + 1;
+
+  return NewName;
+}
+
 // dropAllReferences() - This function causes all the subelements to "let go"
 // of all references that they are maintaining.  This allows one to 'delete' a
 // whole module at a time, even though there may be circular references... first
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index b7a002b0573b..595cc6d04cd7 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4542,7 +4542,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
   // know they are legal for the intrinsic!) get the intrinsic name through the
   // usual means.  This allows us to verify the mangling of argument types into
   // the name.
-  const std::string ExpectedName = Intrinsic::getName(ID, ArgTys);
+  const std::string ExpectedName =
+      Intrinsic::getName(ID, ArgTys, IF->getParent(), IFTy);
   Assert(ExpectedName == IF->getName(),
          "Intrinsic name not mangled correctly for type arguments! "
         "Should be: " +
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index 1004e4e7d334..f9b9b94911a7 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -460,6 +460,14 @@ class IRLinker {
       if (DGV->hasLocalLinkage())
         return nullptr;
 
+      // If we found an intrinsic declaration with mismatching prototypes, we
+      // probably had a nameclash. Don't use that version.
+      if (auto *FDGV = dyn_cast<Function>(DGV))
+        if (FDGV->isIntrinsic())
+          if (const auto *FSrcGV = dyn_cast<Function>(SrcGV))
+            if (FDGV->getFunctionType() != TypeMap.get(FSrcGV->getFunctionType()))
+              return nullptr;
+
       // Otherwise, we do in fact link to the destination global.
       return DGV;
     }
@@ -995,6 +1003,7 @@ Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV,
     return linkAppendingVarProto(cast_or_null<GlobalVariable>(DGV),
                                  cast<GlobalVariable>(SGV));
 
+  bool NeedsRenaming = false;
   GlobalValue *NewGV;
   if (DGV && !ShouldLink) {
     NewGV = DGV;
@@ -1007,15 +1016,21 @@ Expected<Constant *> IRLinker::linkGlobalValueProto(GlobalValue *SGV,
     NewGV = copyGlobalValueProto(SGV, ShouldLink || ForIndirectSymbol);
 
     if (ShouldLink || !ForIndirectSymbol)
-      forceRenaming(NewGV, SGV->getName());
+      NeedsRenaming = true;
   }
 
   // Overloaded intrinsics have overloaded types names as part of their
   // names. If we renamed overloaded types we should rename the intrinsic
   // as well.
   if (Function *F = dyn_cast<Function>(NewGV))
-    if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F))
+    if (auto Remangled = Intrinsic::remangleIntrinsicFunction(F)) {
+      NewGV->eraseFromParent();
       NewGV = Remangled.getValue();
+      NeedsRenaming = false;
+    }
+
+  if (NeedsRenaming)
+    forceRenaming(NewGV, SGV->getName());
 
   if (ShouldLink || ForIndirectSymbol) {
     if (const Comdat *SC = SGV->getComdat()) {
diff --git a/llvm/test/Bitcode/intrinsics-with-unnamed-types.ll b/llvm/test/Bitcode/intrinsics-with-unnamed-types.ll
new file mode 100644
index 000000000000..02d86ec2c5da
--- /dev/null
+++ b/llvm/test/Bitcode/intrinsics-with-unnamed-types.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-as -o - %s | llvm-dis -o - 2>&1 | FileCheck %s
+
+; Make sure we can assemble and disassemble IR containing intrinsics with
+; unnamed types.
+
+%1 = type opaque
+%0 = type opaque
+
+; CHECK-LABEL: @f0(
+; CHECK: %c1 = call %0* @llvm.ssa.copy.p0s_s.0(%0* %arg)
+; CHECK: %c2 = call %1* @llvm.ssa.copy.p0s_s.1(%1* %tmp)
+; CHECK: %c3 = call %0** @llvm.ssa.copy.p0p0s_s.1(%0** %arg2)
+; CHECK: %c4 = call %1** @llvm.ssa.copy.p0p0s_s.0(%1** %tmp2)
+
+define void @f0(%0* %arg, %1* %tmp, %1** %tmp2, %0** %arg2) {
+bb:
+  %cmp1 = icmp ne %0* %arg, null
+  %c1 = call %0* @llvm.ssa.copy.p0s_s.0(%0* %arg)
+  %c2 = call %1* @llvm.ssa.copy.p0s_s.1(%1* %tmp)
+  %c3 = call %0** @llvm.ssa.copy.p0p0s_s.1(%0** %arg2)
+  %c4 = call %1** @llvm.ssa.copy.p0p0s_s.0(%1** %tmp2)
+  ret void
+}
+
+declare %0* @llvm.ssa.copy.p0s_s.0(%0* returned)
+
+declare %1* @llvm.ssa.copy.p0s_s.1(%1* returned)
+
+declare %0** @llvm.ssa.copy.p0p0s_s.1(%0** returned)
+
+declare %1** @llvm.ssa.copy.p0p0s_s.0(%1** returned)
diff --git a/llvm/test/Linker/intrinsics-with-unnamed-types.ll b/llvm/test/Linker/intrinsics-with-unnamed-types.ll
new file mode 100644
index 000000000000..76287f698df9
--- /dev/null
+++ b/llvm/test/Linker/intrinsics-with-unnamed-types.ll
@@ -0,0 +1,101 @@
+; RUN: split-file %s %t
+; RUN: llvm-as -o %t1.bc %t/f01.ll
+; RUN: llvm-as -o %t2.bc %t/f02.ll
+; RUN: llvm-link %t1.bc %t2.bc -o %t3.bc
+; RUN: llvm-dis -o - %t3.bc | FileCheck %s
+
+; Make sure we can link files with clashing intrinsic names using unnamed types.
+ +;--- f01.ll +%1 = type opaque +%0 = type opaque + +; CHECK-LABEL: @test01( +; CHECK: %cmp1 = icmp ne %0* %arg, null +; CHECK-NEXT: %c1 = call %0* @llvm.ssa.copy.p0s_s.0(%0* %arg) +; CHECK-NEXT: %c2 = call %1* @llvm.ssa.copy.p0s_s.1(%1* %tmp) +; CHECK-NEXT: %c3a = call %0** @llvm.ssa.copy.p0p0s_s.0(%0** %arg2) +; CHECK-NEXT: %c3b = call %0** @llvm.ssa.copy.p0p0s_s.0(%0** %arg2) +; CHECK-NEXT: %c4a = call %1** @llvm.ssa.copy.p0p0s_s.1(%1** %tmp2) +; CHECK-NEXT: %c4ba = call %1** @llvm.ssa.copy.p0p0s_s.1(%1** %tmp2) +; CHECK-NEXT: %c5 = call %0*** @llvm.ssa.copy.p0p0p0s_s.0(%0*** %arg3) +; CHECK-NEXT: %c6 = call %1*** @llvm.ssa.copy.p0p0p0s_s.1(%1*** %tmp3) + +define void @test01(%0* %arg, %1* %tmp, %1** %tmp2, %0** %arg2, %1*** %tmp3, %0*** %arg3) { +bb: + %cmp1 = icmp ne %0* %arg, null + %c1 = call %0* @llvm.ssa.copy.p0s_s.0(%0* %arg) + %c2 = call %1* @llvm.ssa.copy.p0s_s.1(%1* %tmp) + %c3a = call %0** @llvm.ssa.copy.p0p0s_s.1(%0** %arg2) + %c3b = call %0** @llvm.ssa.copy.p0p0s_s.1(%0** %arg2) + %c4a = call %1** @llvm.ssa.copy.p0p0s_s.0(%1** %tmp2) + %c4ba = call %1** @llvm.ssa.copy.p0p0s_s.0(%1** %tmp2) + %c5 = call %0*** @llvm.ssa.copy.p0p0p0s_s.1(%0*** %arg3) + %c6 = call %1*** @llvm.ssa.copy.p0p0p0s_s.0(%1*** %tmp3) + ret void +} + +declare %0* @llvm.ssa.copy.p0s_s.0(%0* returned) + +declare %1* @llvm.ssa.copy.p0s_s.1(%1* returned) + +declare %0** @llvm.ssa.copy.p0p0s_s.1(%0** returned) + +declare %1** @llvm.ssa.copy.p0p0s_s.0(%1** returned) + +declare %0*** @llvm.ssa.copy.p0p0p0s_s.1(%0*** returned) + +declare %1*** @llvm.ssa.copy.p0p0p0s_s.0(%1*** returned) + +; now with recycling of previous declarations: +; CHECK-LABEL: @test02( +; CHECK: %cmp1 = icmp ne %0* %arg, null +; CHECK-NEXT: %c4a = call %1** @llvm.ssa.copy.p0p0s_s.1(%1** %tmp2) +; CHECK-NEXT: %c6 = call %1*** @llvm.ssa.copy.p0p0p0s_s.1(%1*** %tmp3) +; CHECK-NEXT: %c1 = call %0* @llvm.ssa.copy.p0s_s.0(%0* %arg) +; CHECK-NEXT: %c2 = call %1* @llvm.ssa.copy.p0s_s.1(%1* %tmp) +; CHECK-NEXT: %c3b = call %0** @llvm.ssa.copy.p0p0s_s.0(%0** %arg2) +; CHECK-NEXT: %c4ba = call %1** @llvm.ssa.copy.p0p0s_s.1(%1** %tmp2) +; CHECK-NEXT: %c5 = call %0*** @llvm.ssa.copy.p0p0p0s_s.0(%0*** %arg3) + +define void @test02(%0* %arg, %1* %tmp, %1** %tmp2, %0** %arg2, %1*** %tmp3, %0*** %arg3) { +bb: + %cmp1 = icmp ne %0* %arg, null + %c4a = call %1** @llvm.ssa.copy.p0p0s_s.0(%1** %tmp2) + %c6 = call %1*** @llvm.ssa.copy.p0p0p0s_s.0(%1*** %tmp3) + %c1 = call %0* @llvm.ssa.copy.p0s_s.0(%0* %arg) + %c2 = call %1* @llvm.ssa.copy.p0s_s.1(%1* %tmp) + %c3b = call %0** @llvm.ssa.copy.p0p0s_s.1(%0** %arg2) + %c4ba = call %1** @llvm.ssa.copy.p0p0s_s.0(%1** %tmp2) + %c5 = call %0*** @llvm.ssa.copy.p0p0p0s_s.1(%0*** %arg3) + ret void +} + +;--- f02.ll +%1 = type opaque +%2 = type opaque + +; CHECK-LABEL: @test03( +; CHECK: %cmp1 = icmp ne %3* %arg, null +; CHECK-NEXT: %c1 = call %3* @llvm.ssa.copy.p0s_s.2(%3* %arg) +; CHECK-NEXT: %c2 = call %2* @llvm.ssa.copy.p0s_s.3(%2* %tmp) +; CHECK-NEXT: %c3 = call %3** @llvm.ssa.copy.p0p0s_s.2(%3** %arg2) +; CHECK-NEXT: %c4 = call %2** @llvm.ssa.copy.p0p0s_s.3(%2** %tmp2) + +define void @test03(%1* %tmp, %2* %arg, %1** %tmp2, %2** %arg2) { +bb: + %cmp1 = icmp ne %2* %arg, null + %c1 = call %2* @llvm.ssa.copy.p0s_s.0(%2* %arg) + %c2 = call %1* @llvm.ssa.copy.p0s_s.1(%1* %tmp) + %c3 = call %2** @llvm.ssa.copy.p0p0s_s.1(%2** %arg2) + %c4 = call %1** @llvm.ssa.copy.p0p0s_s.0(%1** %tmp2) + ret void +} + +declare %2* @llvm.ssa.copy.p0s_s.0(%2* returned) + +declare %1* @llvm.ssa.copy.p0s_s.1(%1* returned) + +declare %2** 
@llvm.ssa.copy.p0p0s_s.1(%2** returned)
+
+declare %1** @llvm.ssa.copy.p0p0s_s.0(%1** returned)
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
new file mode 100644
index 000000000000..9a63501e091f
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/X86/pr48340.ll
@@ -0,0 +1,54 @@
+; RUN: opt -loop-vectorize --force-vector-width=4 --force-vector-interleave=0 -S -o - < %s | FileCheck %s
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%0 = type { i32 }
+%1 = type { i64 }
+
+define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
+; CHECK-LABEL: @foo(
+; CHECK: vector.body:
+; CHECK: [[WIDE_MASKED_GATHER0:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP5:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP6:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP7:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP8:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+entry:
+  br label %loop
+
+loop:
+  %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
+  %p.inc = getelementptr inbounds i64, i64* %p2, i64 2
+  %p3 = bitcast i64* %p2 to %0**
+  %v = load %0*, %0** %p3, align 8
+  %b = icmp eq i64* %p.inc, %p.last
+  br i1 %b, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
+; CHECK-LABEL: @bar(
+; CHECK: vector.body:
+; CHECK: [[WIDE_MASKED_GATHER0:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP5:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP6:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP7:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP8:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+entry:
+  br label %loop
+
+loop:
+  %p2 = phi i64* [ %p, %entry ], [ %p.inc, %loop ]
+  %p.inc = getelementptr inbounds i64, i64* %p2, i64 2
+  %p3 = bitcast i64* %p2 to %1**
+  %v = load %1*, %1** %p3, align 8
+  %b = icmp eq i64* %p.inc, %p.last
+  br i1 %b, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+attributes #0 = { "target-cpu"="skylake" }
-- 
GitLab


From 2049fe58903b68f66872a18e608f40e5233b55fb Mon Sep 17 00:00:00 2001
From: Maxim Kuvyrkov
Date: Fri, 19 Mar 2021 13:37:19 +0000
Subject: [PATCH 0142/1000] [WoA][MSVC] Use default linker setting in MSVC-compatible driver [take 2]

At the moment "link.exe" is hard-coded as default linker in MSVC.cpp,
so there's no way to use LLD as default linker for MSVC driver.

This patch adds checking of CLANG_DEFAULT_LINKER to MSVC.cpp and updates
unit-tests that expect the link.exe linker to select it explicitly via
-fuse-ld=link, so that buildbots and other builds that set
-DCLANG_DEFAULT_LINKER=foobar don't fail these tests.
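For example, a RUN line that checks for link.exe (this one is taken from
clang/test/Driver/Xlinker-args.c below) changes from

  // RUN: %clang -target i686-pc-win32 -### \
  // RUN:   -Xlinker one -Wl,two %s 2>&1 | \
  // RUN:   FileCheck -check-prefix=WIN %s

to

  // RUN: %clang -target i686-pc-win32 -fuse-ld=link -### \
  // RUN:   -Xlinker one -Wl,two %s 2>&1 | \
  // RUN:   FileCheck -check-prefix=WIN %s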
This is a squash of - https://reviews.llvm.org/D98493 (MSVC.cpp change) and - https://reviews.llvm.org/D98862 (unit-tests change) Reviewed By: maxim-kuvyrkov Differential Revision: https://reviews.llvm.org/D98935 --- clang/lib/Driver/ToolChains/MSVC.cpp | 6 +++++- clang/test/Driver/Xlinker-args.c | 2 +- clang/test/Driver/cl-inputs.c | 6 +++--- clang/test/Driver/cl-link-at-file.c | 2 +- clang/test/Driver/cl-link.c | 22 +++++++++++----------- clang/test/Driver/msvc-link.c | 8 ++++---- clang/test/OpenMP/linking.c | 4 ++-- 7 files changed, 27 insertions(+), 23 deletions(-) diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 96de02378ca2..877919e11464 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -11,6 +11,7 @@ #include "Darwin.h" #include "clang/Basic/CharInfo.h" #include "clang/Basic/Version.h" +#include "clang/Config/config.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Driver.h" #include "clang/Driver/DriverDiagnostic.h" @@ -577,7 +578,10 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA, // translate 'lld' into 'lld-link', and in the case of the regular msvc // linker, we need to use a special search algorithm. llvm::SmallString<128> linkPath; - StringRef Linker = Args.getLastArgValue(options::OPT_fuse_ld_EQ, "link"); + StringRef Linker + = Args.getLastArgValue(options::OPT_fuse_ld_EQ, CLANG_DEFAULT_LINKER); + if (Linker.empty()) + Linker = "link"; if (Linker.equals_lower("lld")) Linker = "lld-link"; diff --git a/clang/test/Driver/Xlinker-args.c b/clang/test/Driver/Xlinker-args.c index a44957cd8aef..cb045a1d40ac 100644 --- a/clang/test/Driver/Xlinker-args.c +++ b/clang/test/Driver/Xlinker-args.c @@ -17,7 +17,7 @@ // LINUX: "--no-demangle" "-e" "_start" "one" "two" "three" "four" "-z" "five" "-r" {{.*}} "-T" "a.lds" // Check that we forward '-Xlinker' and '-Wl,' on Windows. 
-// RUN: %clang -target i686-pc-win32 -### \ +// RUN: %clang -target i686-pc-win32 -fuse-ld=link -### \ // RUN: -Xlinker one -Wl,two %s 2>&1 | \ // RUN: FileCheck -check-prefix=WIN %s // WIN: link.exe diff --git a/clang/test/Driver/cl-inputs.c b/clang/test/Driver/cl-inputs.c index 59455a0aa5e5..8eb44517ee16 100644 --- a/clang/test/Driver/cl-inputs.c +++ b/clang/test/Driver/cl-inputs.c @@ -50,16 +50,16 @@ // RUN: %clang_cl -### /Tc - 2>&1 | FileCheck -check-prefix=STDINTc %s // STDINTc: "-x" "c" -// RUN: env LIB=%S/Inputs/cl-libs %clang_cl -### -- %s cl-test.lib 2>&1 | FileCheck -check-prefix=LIBINPUT %s +// RUN: env LIB=%S/Inputs/cl-libs %clang_cl -fuse-ld=link -### -- %s cl-test.lib 2>&1 | FileCheck -check-prefix=LIBINPUT %s // LIBINPUT: link.exe" // LIBINPUT: "cl-test.lib" -// RUN: env LIB=%S/Inputs/cl-libs %clang_cl -### -- %s cl-test2.lib 2>&1 | FileCheck -check-prefix=LIBINPUT2 %s +// RUN: env LIB=%S/Inputs/cl-libs %clang_cl -fuse-ld=link -### -- %s cl-test2.lib 2>&1 | FileCheck -check-prefix=LIBINPUT2 %s // LIBINPUT2: error: no such file or directory: 'cl-test2.lib' // LIBINPUT2: link.exe" // LIBINPUT2-NOT: "cl-test2.lib" -// RUN: %clang_cl -### -- %s /nonexisting.lib 2>&1 | FileCheck -check-prefix=LIBINPUT3 %s +// RUN: %clang_cl -fuse-ld=link -### -- %s /nonexisting.lib 2>&1 | FileCheck -check-prefix=LIBINPUT3 %s // LIBINPUT3: error: no such file or directory: '/nonexisting.lib' // LIBINPUT3: link.exe" // LIBINPUT3-NOT: "/nonexisting.lib" diff --git a/clang/test/Driver/cl-link-at-file.c b/clang/test/Driver/cl-link-at-file.c index 50ae07fadf5b..4e665f89b74e 100644 --- a/clang/test/Driver/cl-link-at-file.c +++ b/clang/test/Driver/cl-link-at-file.c @@ -7,7 +7,7 @@ // RUN: echo /link bar.lib baz.lib > %t.args // RUN: touch %t.obj -// RUN: %clang_cl -### @%t.args -- %t.obj 2>&1 | FileCheck %s -check-prefix=ARGS +// RUN: %clang_cl -fuse-ld=link -### @%t.args -- %t.obj 2>&1 | FileCheck %s -check-prefix=ARGS // If the "/link" option captures all remaining args beyond its response file, // it will also capture "--" and our input argument. In this case, Clang will // be clueless and will emit "argument unused" warnings. If PR17239 is properly diff --git a/clang/test/Driver/cl-link.c b/clang/test/Driver/cl-link.c index 142725fed8eb..e2f5397e9133 100644 --- a/clang/test/Driver/cl-link.c +++ b/clang/test/Driver/cl-link.c @@ -2,14 +2,14 @@ // be interpreted as a command-line option, e.g. on Mac where %s is commonly // under /Users. 
-// RUN: %clang_cl /Tc%s -### /link foo bar baz 2>&1 | FileCheck --check-prefix=LINK %s -// RUN: %clang_cl /Tc%s -### /linkfoo bar baz 2>&1 | FileCheck --check-prefix=LINK %s +// RUN: %clang_cl /Tc%s -fuse-ld=link -### /link foo bar baz 2>&1 | FileCheck --check-prefix=LINK %s +// RUN: %clang_cl /Tc%s -fuse-ld=link -### /linkfoo bar baz 2>&1 | FileCheck --check-prefix=LINK %s // LINK: link.exe // LINK: "foo" // LINK: "bar" // LINK: "baz" -// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /Tc%s -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN %s +// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN %s // ASAN: link.exe // ASAN: "-debug" // ASAN: "-incremental:no" @@ -19,7 +19,7 @@ // ASAN: "-wholearchive:{{.*}}clang_rt.asan_cxx-i386.lib" // ASAN: "{{.*}}cl-link{{.*}}.obj" -// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /MD /Tc%s -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-MD %s +// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /MD /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-MD %s // ASAN-MD: link.exe // ASAN-MD: "-debug" // ASAN-MD: "-incremental:no" @@ -29,13 +29,13 @@ // ASAN-MD: "-wholearchive:{{.*}}clang_rt.asan_dynamic_runtime_thunk-i386.lib" // ASAN-MD: "{{.*}}cl-link{{.*}}.obj" -// RUN: %clang_cl /LD -### /Tc%s 2>&1 | FileCheck --check-prefix=DLL %s -// RUN: %clang_cl /LDd -### /Tc%s 2>&1 | FileCheck --check-prefix=DLL %s +// RUN: %clang_cl /LD -fuse-ld=link -### /Tc%s 2>&1 | FileCheck --check-prefix=DLL %s +// RUN: %clang_cl /LDd -fuse-ld=link -### /Tc%s 2>&1 | FileCheck --check-prefix=DLL %s // DLL: link.exe // "-dll" -// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /LD /Tc%s -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-DLL %s -// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /LDd /Tc%s -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-DLL %s +// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /LD /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-DLL %s +// RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /LDd /Tc%s -fuse-ld=link -### -fsanitize=address 2>&1 | FileCheck --check-prefix=ASAN-DLL %s // ASAN-DLL: link.exe // ASAN-DLL: "-dll" // ASAN-DLL: "-debug" @@ -43,13 +43,13 @@ // ASAN-DLL: "{{.*}}clang_rt.asan_dll_thunk-i386.lib" // ASAN-DLL: "{{.*}}cl-link{{.*}}.obj" -// RUN: %clang_cl /Zi /Tc%s -### 2>&1 | FileCheck --check-prefix=DEBUG %s +// RUN: %clang_cl /Zi /Tc%s -fuse-ld=link -### 2>&1 | FileCheck --check-prefix=DEBUG %s // DEBUG: link.exe // DEBUG: "-debug" // PR27234 -// RUN: %clang_cl /Tc%s nonexistent.obj -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s -// RUN: %clang_cl /Tc%s nonexistent.lib -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s +// RUN: %clang_cl /Tc%s nonexistent.obj -fuse-ld=link -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s +// RUN: %clang_cl /Tc%s nonexistent.lib -fuse-ld=link -### /link /libpath:somepath 2>&1 | FileCheck --check-prefix=NONEXISTENT %s // NONEXISTENT-NOT: no such file // NONEXISTENT: link.exe // NONEXISTENT: "/libpath:somepath" diff --git a/clang/test/Driver/msvc-link.c b/clang/test/Driver/msvc-link.c index 13dccd21bfd8..1ee17fc63c32 100644 --- a/clang/test/Driver/msvc-link.c +++ b/clang/test/Driver/msvc-link.c @@ -1,4 +1,4 @@ -// RUN: %clang -target i686-pc-windows-msvc -### %s 2>&1 | FileCheck 
--check-prefix=BASIC %s +// RUN: %clang -target i686-pc-windows-msvc -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=BASIC %s // BASIC: link.exe" // BASIC: "-out:a.exe" // BASIC: "-defaultlib:libcmt" @@ -6,7 +6,7 @@ // BASIC: "-nologo" // BASIC-NOT: "-Brepro" -// RUN: %clang -target i686-pc-windows-msvc -shared -o a.dll -### %s 2>&1 | FileCheck --check-prefix=DLL %s +// RUN: %clang -target i686-pc-windows-msvc -shared -o a.dll -fuse-ld=link -### %s 2>&1 | FileCheck --check-prefix=DLL %s // DLL: link.exe" // DLL: "-out:a.dll" // DLL: "-defaultlib:libcmt" @@ -19,13 +19,13 @@ // LIBPATH: "-libpath:/usr/lib" // LIBPATH: "-nologo" -// RUN: %clang_cl /Brepro -### -- %s 2>&1 | FileCheck --check-prefix=REPRO %s +// RUN: %clang_cl /Brepro -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=REPRO %s // REPRO: link.exe" // REPRO: "-out:msvc-link.exe" // REPRO: "-nologo" // REPRO: "-Brepro" -// RUN: %clang_cl /Brepro- -### -- %s 2>&1 | FileCheck --check-prefix=NOREPRO %s +// RUN: %clang_cl /Brepro- -fuse-ld=link -### -- %s 2>&1 | FileCheck --check-prefix=NOREPRO %s // NOREPRO: link.exe" // NOREPRO: "-out:msvc-link.exe" // NOREPRO: "-nologo" diff --git a/clang/test/OpenMP/linking.c b/clang/test/OpenMP/linking.c index 802553c1be75..1c4439626470 100644 --- a/clang/test/OpenMP/linking.c +++ b/clang/test/OpenMP/linking.c @@ -81,7 +81,7 @@ // CHECK-LD-OVERRIDE-64: "-lgomp" "-lrt" // CHECK-LD-OVERRIDE-64: "-lpthread" "-lc" // -// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ +// RUN: %clang -no-canonical-prefixes -fuse-ld=link %s -### -o %t.o 2>&1 \ // RUN: -fopenmp=libomp -target x86_64-msvc-win32 -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-MSVC-LINK-64 %s // CHECK-MSVC-LINK-64: link.exe @@ -95,7 +95,7 @@ // SIMD-ONLY11-NOT: libomp // SIMD-ONLY11-NOT: libgomp // -// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ +// RUN: %clang -no-canonical-prefixes %s -fuse-ld=link -### -o %t.o 2>&1 \ // RUN: -fopenmp=libiomp5 -target x86_64-msvc-win32 -rtlib=platform \ // RUN: | FileCheck --check-prefix=CHECK-MSVC-ILINK-64 %s -- GitLab From 028d6250eac5b8ec3624daaff954d9e52108caf4 Mon Sep 17 00:00:00 2001 From: Ricky Taylor Date: Wed, 17 Mar 2021 21:34:36 +0000 Subject: [PATCH 0143/1000] [M68k] Replace unknown operand with explicit type Replace the unknown operand used for immediate operands for DIV/MUL with a fixed 16-bit immediate. This is required since the assembly parser generator requires that all operands are typed. Differential Revision: https://reviews.llvm.org/D98819 --- llvm/lib/Target/M68k/M68kInstrArithmetic.td | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/M68k/M68kInstrArithmetic.td b/llvm/lib/Target/M68k/M68kInstrArithmetic.td index d6ecec07439d..81286c8f162c 100644 --- a/llvm/lib/Target/M68k/M68kInstrArithmetic.td +++ b/llvm/lib/Target/M68k/M68kInstrArithmetic.td @@ -522,7 +522,7 @@ class MxDiMuOp_DD CMD, MxBead3Bits OPMODE, // $reg <- $reg op $imm class MxDiMuOp_DI CMD, MxBead3Bits OPMODE, MxOperand DST, MxOperand OPD> - : MxInst<(outs DST:$dst), (ins DST:$src, unknown:$opd), MN#"\t$opd, $dst", [], + : MxInst<(outs DST:$dst), (ins DST:$src, OPD:$opd), MN#"\t$opd, $dst", [], MxDiMuEncoding, OPMODE, MxEncEAi, MxExtI16_2>>; } // let Constraints } // Defs = [CCR] @@ -545,6 +545,12 @@ multiclass MxDiMuOp CMD, bit isComm = 0> { defm DIV : MxDiMuOp<"div", 0x8>; +// This is used to cast immediates to 16-bits for operations which don't +// support smaller immediate sizes. 
+def as_i16imm : SDNodeXFormgetTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16); +}]>; + // RR i8 def : Pat<(sdiv i8:$dst, i8:$opd), (EXTRACT_SUBREG @@ -591,22 +597,22 @@ def : Pat<(urem i16:$dst, i16:$opd), // RI i8 def : Pat<(sdiv i8:$dst, MximmSExt8:$opd), (EXTRACT_SUBREG - (SDIVd32i16 (MOVSXd32d8 $dst), imm:$opd), + (SDIVd32i16 (MOVSXd32d8 $dst), (as_i16imm $opd)), MxSubRegIndex8Lo)>; def : Pat<(udiv i8:$dst, MximmSExt8:$opd), (EXTRACT_SUBREG - (UDIVd32i16 (MOVZXd32d8 $dst), imm:$opd), + (UDIVd32i16 (MOVZXd32d8 $dst), (as_i16imm $opd)), MxSubRegIndex8Lo)>; def : Pat<(srem i8:$dst, MximmSExt8:$opd), (EXTRACT_SUBREG - (ASR32di (ASR32di (SDIVd32i16 (MOVSXd32d8 $dst), imm:$opd), 8), 8), + (ASR32di (ASR32di (SDIVd32i16 (MOVSXd32d8 $dst), (as_i16imm $opd)), 8), 8), MxSubRegIndex8Lo)>; def : Pat<(urem i8:$dst, MximmSExt8:$opd), (EXTRACT_SUBREG - (LSR32di (LSR32di (UDIVd32i16 (MOVZXd32d8 $dst), imm:$opd), 8), 8), + (LSR32di (LSR32di (UDIVd32i16 (MOVZXd32d8 $dst), (as_i16imm $opd)), 8), 8), MxSubRegIndex8Lo)>; // RI i16 -- GitLab From a9fc44c5573208859c2550382755098d750fc47d Mon Sep 17 00:00:00 2001 From: "Paul C. Anagnostopoulos" Date: Thu, 25 Feb 2021 16:33:08 -0500 Subject: [PATCH 0144/1000] [TableGen] Improve handling of template arguments This requires changes to TableGen files and some C++ files due to incompatible multiclass template arguments that slipped through before the improved handling. --- clang/utils/TableGen/MveEmitter.cpp | 13 +- llvm/docs/TableGen/ProgRef.rst | 21 +- llvm/include/llvm/TableGen/Record.h | 6 + llvm/lib/TableGen/Record.cpp | 14 +- llvm/lib/TableGen/TGParser.cpp | 346 +++++++++--------- llvm/lib/TableGen/TGParser.h | 8 +- .../test/TableGen/self-reference-typeerror.td | 9 +- llvm/test/TableGen/template-args.td | 142 +++++++ 8 files changed, 362 insertions(+), 197 deletions(-) create mode 100644 llvm/test/TableGen/template-args.td diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index e9ae08ac4c05..091af2dc52a1 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1272,6 +1272,13 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum, return it->second; } + // Sometimes the Arg is a bit. Prior to multiclass template argument + // checking, integers would sneak through the bit declaration, + // but now they really are bits. + if (auto *BI = dyn_cast(Arg)) + return std::make_shared(getScalarType("u32"), + BI->getValue()); + if (auto *II = dyn_cast(Arg)) return std::make_shared(getScalarType("u32"), II->getValue()); @@ -1287,7 +1294,11 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum, } } - PrintFatalError("bad dag argument type for code generation"); + PrintError("bad DAG argument type for code generation"); + PrintNote("DAG: " + D->getAsString()); + if (TypedInit *Typed = dyn_cast(Arg)) + PrintNote("argument type: " + Typed->getType()->getAsString()); + PrintFatalNote("argument number " + Twine(ArgNum) + ": " + Arg->getAsString()); } Result::Ptr EmitterBase::getCodeForArg(unsigned ArgNum, const Type *ArgType, diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst index c60bffef3ed2..9799e29a63e6 100644 --- a/llvm/docs/TableGen/ProgRef.rst +++ b/llvm/docs/TableGen/ProgRef.rst @@ -299,7 +299,7 @@ wide range of records conveniently and compactly. :token:`ClassID` Specifying a class name in a type context indicates that the type of the defined value must - be a subclass of the specified class. 
This is useful in conjunction with + be a subclass of the specified class. This is useful in conjunction with the ``list`` type; for example, to constrain the elements of the list to a common base class (e.g., a ``list`` can only contain definitions derived from the ``Register`` class). @@ -554,19 +554,22 @@ classes and records can inherit. TemplateArgDecl: `Type` `TokIdentifier` ["=" `Value`] A class can be parameterized by a list of "template arguments," whose values -can be used in the class's record body. These template arguments are +can be used in the class's record body. These template arguments are specified each time the class is inherited by another class or record. If a template argument is not assigned a default value with ``=``, it is uninitialized (has the "value" ``?``) and must be specified in the template -argument list when the class is inherited. If an argument is assigned a -default value, then it need not be specified in the argument list. The -template argument default values are evaluated from left to right. +argument list when the class is inherited (required argument). If an +argument is assigned a default value, then it need not be specified in the +argument list (optional argument). In the declaration, all required template +arguments must precede any optional arguments. The template argument default +values are evaluated from left to right. The :token:`RecordBody` is defined below. It can include a list of -superclasses from which the current class inherits, along with field definitions -and other statements. When a class ``C`` inherits from another class ``D``, -the fields of ``D`` are effectively merged into the fields of ``C``. +superclasses from which the current class inherits, along with field +definitions and other statements. When a class ``C`` inherits from another +class ``D``, the fields of ``D`` are effectively merged into the fields of +``C``. A given class can only be defined once. A ``class`` statement is considered to define the class if *any* of the following are true (the @@ -605,7 +608,7 @@ of the fields of the class or record. RecordBody: `ParentClassList` `Body` ParentClassList: [":" `ParentClassListNE`] ParentClassListNE: `ClassRef` ("," `ClassRef`)* - ClassRef: (`ClassID` | `MultiClassID`) ["<" `ValueList` ">"] + ClassRef: (`ClassID` | `MultiClassID`) ["<" [`ValueList`] ">"] A :token:`ParentClassList` containing a :token:`MultiClassID` is valid only in the class list of a ``defm`` statement. 
In that case, the ID must be the diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index e75b7f01c868..ea47d6713026 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -2024,6 +2024,12 @@ public: void set(Init *Key, Init *Value) { Map[Key] = {Value, false}; } + bool isComplete(Init *VarName) const { + auto It = Map.find(VarName); + assert(It != Map.end() && "key must be present in map"); + return It->second.V->isComplete(); + } + Init *resolve(Init *VarName) override; }; diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 13212098514d..3172d711e7f6 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -2344,13 +2344,13 @@ void Record::resolveReferences(Resolver &R, const RecordVal *SkipVal) { if (TypedInit *VRT = dyn_cast(VR)) Type = (Twine("of type '") + VRT->getType()->getAsString() + "' ").str(); - PrintFatalError(getLoc(), Twine("Invalid value ") + Type + - "is found when setting '" + - Value.getNameInitAsString() + - "' of type '" + - Value.getType()->getAsString() + - "' after resolving references: " + - VR->getAsUnquotedString() + "\n"); + PrintFatalError( + getLoc(), + Twine("Invalid value ") + Type + "found when setting field '" + + Value.getNameInitAsString() + "' of type '" + + Value.getType()->getAsString() + + "' after resolving references: " + VR->getAsUnquotedString() + + "\n"); } } } diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 87faf77671c6..974df42de4c1 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -229,38 +229,33 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName, /// args as SubClass's template arguments. bool TGParser::AddSubClass(Record *CurRec, SubClassReference &SubClass) { Record *SC = SubClass.Rec; - // Add all of the values in the subclass into the current class. - for (const RecordVal &Val : SC->getValues()) - if (AddValue(CurRec, SubClass.RefRange.Start, Val)) - return true; - - ArrayRef TArgs = SC->getTemplateArgs(); - - // Ensure that an appropriate number of template arguments are specified. - if (TArgs.size() < SubClass.TemplateArgs.size()) - return Error(SubClass.RefRange.Start, - "More template args specified than expected"); - - // Loop over all of the template arguments, setting them to the specified - // value or leaving them as the default if necessary. MapResolver R(CurRec); - for (unsigned i = 0, e = TArgs.size(); i != e; ++i) { - if (i < SubClass.TemplateArgs.size()) { - // If a value is specified for this template arg, set it now. - if (SetValue(CurRec, SubClass.RefRange.Start, TArgs[i], - None, SubClass.TemplateArgs[i])) + // Loop over all the subclass record's fields. Add template arguments + // to the resolver map. Add regular fields to the new record. 
+ for (const RecordVal &Field : SC->getValues()) { + if (Field.isTemplateArg()) { + R.set(Field.getNameInit(), Field.getValue()); + } else { + if (AddValue(CurRec, SubClass.RefRange.Start, Field)) return true; - } else if (!CurRec->getValue(TArgs[i])->getValue()->isComplete()) { - return Error(SubClass.RefRange.Start, - "Value not specified for template argument #" + - Twine(i) + " (" + TArgs[i]->getAsUnquotedString() + - ") of subclass '" + SC->getNameInitAsString() + "'!"); } + } - R.set(TArgs[i], CurRec->getValue(TArgs[i])->getValue()); - - CurRec->removeValue(TArgs[i]); + ArrayRef TArgs = SC->getTemplateArgs(); + assert(SubClass.TemplateArgs.size() <= TArgs.size() && + "Too many template arguments allowed"); + + // Loop over the template argument names. If a value was specified, + // reset the map value. If not and there was no default, complain. + for (unsigned I = 0, E = TArgs.size(); I != E; ++I) { + if (I < SubClass.TemplateArgs.size()) + R.set(TArgs[I], SubClass.TemplateArgs[I]); + else if (!R.isComplete(TArgs[I])) + return Error(SubClass.RefRange.Start, + "Value not specified for template argument '" + + TArgs[I]->getAsUnquotedString() + "' (#" + Twine(I) + + ") of parent class '" + SC->getNameInitAsString() + "'"); } Init *Name; @@ -584,8 +579,8 @@ MultiClass *TGParser::ParseMultiClassID() { return Result; } -/// ParseSubClassReference - Parse a reference to a subclass or to a templated -/// subclass. This returns a SubClassRefTy with a null Record* on error. +/// ParseSubClassReference - Parse a reference to a subclass or a +/// multiclass. This returns a SubClassRefTy with a null Record* on error. /// /// SubClassRef ::= ClassID /// SubClassRef ::= ClassID '<' ValueList '>' @@ -609,25 +604,18 @@ ParseSubClassReference(Record *CurRec, bool isDefm) { return Result; } - if (Lex.getCode() == tgtok::greater) { - TokError("subclass reference requires a non-empty list of template values"); - Result.Rec = nullptr; + if (ParseTemplateArgValueList(Result.TemplateArgs, CurRec, Result.Rec)) { + Result.Rec = nullptr; // Error parsing value list. return Result; } - ParseValueList(Result.TemplateArgs, CurRec, Result.Rec); - if (Result.TemplateArgs.empty()) { - Result.Rec = nullptr; // Error parsing value list. + if (CheckTemplateArgValues(Result.TemplateArgs, Result.RefRange.Start, + Result.Rec)) { + Result.Rec = nullptr; // Error checking value list. return Result; } - if (!consume(tgtok::greater)) { - TokError("expected '>' in template value list"); - Result.Rec = nullptr; - return Result; - } Result.RefRange.End = Lex.getLoc(); - return Result; } @@ -652,23 +640,12 @@ ParseSubMultiClassReference(MultiClass *CurMC) { return Result; } - if (Lex.getCode() == tgtok::greater) { - TokError("subclass reference requires a non-empty list of template values"); - Result.MC = nullptr; + if (ParseTemplateArgValueList(Result.TemplateArgs, &CurMC->Rec, + &Result.MC->Rec)) { + Result.MC = nullptr; // Error parsing value list. return Result; } - ParseValueList(Result.TemplateArgs, &CurMC->Rec, &Result.MC->Rec); - if (Result.TemplateArgs.empty()) { - Result.MC = nullptr; // Error parsing value list. - return Result; - } - - if (!consume(tgtok::greater)) { - TokError("expected '>' in template value list"); - Result.MC = nullptr; - return Result; - } Result.RefRange.End = Lex.getLoc(); return Result; @@ -2032,15 +2009,9 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, if (Lex.Lex() != tgtok::less) // consume the Id. 
return ParseIDValue(CurRec, Name, NameLoc, Mode); // Value ::= IDValue - // Value ::= ID '<' ValueListNE '>' - if (Lex.Lex() == tgtok::greater) { - TokError("expected non-empty value list"); - return nullptr; - } - - // This is a CLASS expression. This is supposed to synthesize - // a new anonymous definition, deriving from CLASS with no - // body. + // Value ::= CLASSID '<' ValueListNE '>' (CLASSID has been consumed) + // This is supposed to synthesize a new anonymous definition, deriving + // from the class with the template arguments, but no body. Record *Class = Records.getClass(Name->getValue()); if (!Class) { Error(NameLoc, "Expected a class name, got '" + Name->getValue() + "'"); @@ -2048,44 +2019,26 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, } SmallVector Args; - ParseValueList(Args, CurRec, Class); - if (Args.empty()) return nullptr; - - if (!consume(tgtok::greater)) { - TokError("expected '>' at end of value list"); - return nullptr; - } - - // Typecheck the template arguments list - ArrayRef ExpectedArgs = Class->getTemplateArgs(); - if (ExpectedArgs.size() < Args.size()) { - Error(NameLoc, - "More template args specified than expected"); - return nullptr; - } - - for (unsigned i = 0, e = ExpectedArgs.size(); i != e; ++i) { - RecordVal *ExpectedArg = Class->getValue(ExpectedArgs[i]); - if (i < Args.size()) { - if (TypedInit *TI = dyn_cast(Args[i])) { - RecTy *ExpectedType = ExpectedArg->getType(); - if (!TI->getType()->typeIsConvertibleTo(ExpectedType)) { - Error(NameLoc, - "Value specified for template argument #" + Twine(i) + " (" + - ExpectedArg->getNameInitAsString() + ") is of type '" + - TI->getType()->getAsString() + "', expected '" + - ExpectedType->getAsString() + "': " + TI->getAsString()); - return nullptr; - } - continue; - } - } else if (ExpectedArg->getValue()->isComplete()) - continue; - - Error(NameLoc, - "Value not specified for template argument #" + Twine(i) + " (" + - ExpectedArgs[i]->getAsUnquotedString() + ")"); - return nullptr; + Lex.Lex(); // consume the < + if (ParseTemplateArgValueList(Args, CurRec, Class)) + return nullptr; // Error parsing value list. + + if (CheckTemplateArgValues(Args, NameLoc, Class)) + return nullptr; // Error checking template argument values. + + // Loop through the arguments that were not specified and make sure + // they have a complete value. + // TODO: If we just keep a required argument count, we can do away + // with this checking. + ArrayRef TArgs = Class->getTemplateArgs(); + for (unsigned I = Args.size(), E = TArgs.size(); I < E; ++I) { + RecordVal *Arg = Class->getValue(TArgs[I]); + if (!Arg->getValue()->isComplete()) + Error(NameLoc, "Value not specified for template argument '" + + TArgs[I]->getAsUnquotedString() + "' (#" + Twine(I) + + ") of parent class '" + + Class->getNameInitAsString() + "'"); + } return VarDefInit::get(Class, Args)->Fold(); @@ -2158,7 +2111,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType, } if (Lex.getCode() != tgtok::r_square) { - ParseValueList(Vals, CurRec, nullptr, + ParseValueList(Vals, CurRec, GivenListTy ? GivenListTy->getElementType() : nullptr); if (Vals.empty()) return nullptr; } @@ -2522,32 +2475,15 @@ void TGParser::ParseDagArgList( } } -/// ParseValueList - Parse a comma separated list of values, returning them as a -/// vector. Note that this always expects to be able to parse at least one -/// value. It returns an empty list if this is not possible. 
+/// ParseValueList - Parse a comma separated list of values, returning them +/// in a vector. Note that this always expects to be able to parse at least one +/// value. It returns an empty list if this is not possible. /// /// ValueList ::= Value (',' Value) /// -void TGParser::ParseValueList(SmallVectorImpl &Result, Record *CurRec, - Record *ArgsRec, RecTy *EltTy) { - RecTy *ItemType = EltTy; - unsigned int ArgN = 0; - if (ArgsRec && !EltTy) { - ArrayRef TArgs = ArgsRec->getTemplateArgs(); - if (TArgs.empty()) { - TokError("template argument provided to non-template class"); - Result.clear(); - return; - } - const RecordVal *RV = ArgsRec->getValue(TArgs[ArgN]); - if (!RV) { - errs() << "Cannot find template arg " << ArgN << " (" << TArgs[ArgN] - << ")\n"; - } - assert(RV && "Template argument record not found??"); - ItemType = RV->getType(); - ++ArgN; - } +void TGParser::ParseValueList(SmallVectorImpl &Result, Record *CurRec, + RecTy *ItemType) { + Result.push_back(ParseValue(CurRec, ItemType)); if (!Result.back()) { Result.clear(); @@ -2558,19 +2494,6 @@ void TGParser::ParseValueList(SmallVectorImpl &Result, Record *CurRec, // ignore trailing comma for lists if (Lex.getCode() == tgtok::r_square) return; - - if (ArgsRec && !EltTy) { - ArrayRef TArgs = ArgsRec->getTemplateArgs(); - if (ArgN >= TArgs.size()) { - TokError("too many template arguments"); - Result.clear(); - return; - } - const RecordVal *RV = ArgsRec->getValue(TArgs[ArgN]); - assert(RV && "Template argument record not found??"); - ItemType = RV->getType(); - ++ArgN; - } Result.push_back(ParseValue(CurRec, ItemType)); if (!Result.back()) { Result.clear(); @@ -2579,9 +2502,48 @@ void TGParser::ParseValueList(SmallVectorImpl &Result, Record *CurRec, } } +// ParseTemplateArgValueList - Parse a template argument list with the syntax +// shown, filling in the Result vector. The open angle has been consumed. +// An empty argument list is allowed. Return false if okay, true if an +// error was detected. +// +// TemplateArgList ::= '<' [Value {',' Value}*] '>' +bool TGParser::ParseTemplateArgValueList(SmallVectorImpl &Result, + Record *CurRec, Record *ArgsRec) { + + assert(Result.empty() && "Result vector is not empty"); + ArrayRef TArgs = ArgsRec->getTemplateArgs(); + unsigned ArgIndex = 0; + RecTy *ItemType; + + if (consume(tgtok::greater)) // empty value list + return false; + + while (true) { + if (ArgIndex >= TArgs.size()) { + TokError("Too many template arguments: " + utostr(ArgIndex + 1)); + return true; + } + const RecordVal *Arg = ArgsRec->getValue(TArgs[ArgIndex]); + assert(Arg && "Template argument record not found"); + + ItemType = Arg->getType(); + Init *Value = ParseValue(CurRec, ItemType); + if (!Value) + return true; + Result.push_back(Value); + + if (consume(tgtok::greater)) // end of argument list? + return false; + if (!consume(tgtok::comma)) // must be comma + return true; + ++ArgIndex; + } +} + /// ParseDeclaration - Read a declaration, returning the name of field ID, or an -/// empty string on error. This can happen in a number of different context's, -/// including within a def or in the template args for a def (which which case +/// empty string on error. This can happen in a number of different contexts, +/// including within a def or in the template args for a class (in which case /// CurRec will be non-null) and within the template args for a multiclass (in /// which case CurRec will be null, but CurMultiClass will be set). 
This can /// also happen within a def that is within a multiclass, which will set both @@ -2612,23 +2574,28 @@ Init *TGParser::ParseDeclaration(Record *CurRec, Init *DeclName = StringInit::get(Str); Lex.Lex(); - if (ParsingTemplateArgs) { - if (CurRec) - DeclName = QualifyName(*CurRec, CurMultiClass, DeclName, ":"); - else - assert(CurMultiClass); - if (CurMultiClass) - DeclName = QualifyName(CurMultiClass->Rec, CurMultiClass, DeclName, - "::"); - } - - // Add the field to the record. - if (AddValue(CurRec, IdLoc, RecordVal(DeclName, IdLoc, Type, - HasField ? RecordVal::FK_NonconcreteOK - : RecordVal::FK_Normal))) + bool BadField; + if (!ParsingTemplateArgs) { // def, possibly in a multiclass + BadField = AddValue(CurRec, IdLoc, + RecordVal(DeclName, IdLoc, Type, + HasField ? RecordVal::FK_NonconcreteOK + : RecordVal::FK_Normal)); + + } else if (CurRec) { // class template argument + DeclName = QualifyName(*CurRec, CurMultiClass, DeclName, ":"); + BadField = AddValue(CurRec, IdLoc, RecordVal(DeclName, IdLoc, Type, + RecordVal::FK_TemplateArg)); + + } else { // multiclass template argument + assert(CurMultiClass && "invalid context for template argument"); + DeclName = QualifyName(CurMultiClass->Rec, CurMultiClass, DeclName, "::"); + BadField = AddValue(CurRec, IdLoc, RecordVal(DeclName, IdLoc, Type, + RecordVal::FK_TemplateArg)); + } + if (BadField) return nullptr; - // If a value is present, parse it. + // If a value is present, parse it and set new field's value. if (consume(tgtok::equal)) { SMLoc ValLoc = Lex.getLoc(); Init *Val = ParseValue(CurRec, Type); @@ -2715,7 +2682,7 @@ VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { if (!Ranges.empty()) { assert(!IterType && "Type already initialized?"); IterType = IntRecTy::get(); - std::vector Values; + std::vector Values; for (unsigned R : Ranges) Values.push_back(IntInit::get(R)); ForeachListValue = ListInit::get(Values, IterType); @@ -2729,7 +2696,7 @@ VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) { /// ParseTemplateArgList - Read a template argument list, which is a non-empty /// sequence of template-declarations in <>'s. If CurRec is non-null, these are -/// template args for a def, which may or may not be in a multiclass. If null, +/// template args for a class, which may or may not be in a multiclass. If null, /// these are the template args for a multiclass. /// /// TemplateArgList ::= '<' Declaration (',' Declaration)* '>' @@ -3493,32 +3460,28 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { while (true) { if (!Ref.Rec) return true; - // To instantiate a multiclass, we need to first get the multiclass, then - // instantiate each def contained in the multiclass with the SubClassRef - // template parameters. + // To instantiate a multiclass, we get the multiclass and then loop + // through its template argument names. Substs contains a substitution + // value for each argument, either the value specified or the default. + // Then we can resolve the template arguments. MultiClass *MC = MultiClasses[std::string(Ref.Rec->getName())].get(); assert(MC && "Didn't lookup multiclass correctly?"); - ArrayRef TemplateVals = Ref.TemplateArgs; - // Verify that the correct number of template arguments were specified. 
+ ArrayRef TemplateVals = Ref.TemplateArgs; ArrayRef TArgs = MC->Rec.getTemplateArgs(); - if (TArgs.size() < TemplateVals.size()) - return Error(SubClassLoc, - "more template args specified than multiclass expects"); - SubstStack Substs; + for (unsigned i = 0, e = TArgs.size(); i != e; ++i) { if (i < TemplateVals.size()) { Substs.emplace_back(TArgs[i], TemplateVals[i]); } else { Init *Default = MC->Rec.getValue(TArgs[i])->getValue(); - if (!Default->isComplete()) { + if (!Default->isComplete()) return Error(SubClassLoc, - "value not specified for template argument #" + - Twine(i) + " (" + TArgs[i]->getAsUnquotedString() + - ") of multiclass '" + MC->Rec.getNameInitAsString() + - "'"); - } + "value not specified for template argument '" + + TArgs[i]->getAsUnquotedString() + "' (#" + + Twine(i) + ") of multiclass '" + + MC->Rec.getNameInitAsString() + "'"); Substs.emplace_back(TArgs[i], Default); } } @@ -3537,7 +3500,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) { SubClassLoc = Lex.getLoc(); - // A defm can inherit from regular classes (non-multiclass) as + // A defm can inherit from regular classes (non-multiclasses) as // long as they come in the end of the inheritance list. InheritFromClass = (Records.getClass(Lex.getCurStrVal()) != nullptr); @@ -3642,6 +3605,41 @@ bool TGParser::ParseFile() { return TokError("Unexpected token at top level"); } +// Check the types of the template argument values for a class +// inheritance, multiclass invocation, or anonymous class invocation. +// If necessary, replace an argument with a cast to the required type. +// The argument count has already been checked. +bool TGParser::CheckTemplateArgValues(SmallVectorImpl &Values, + SMLoc Loc, Record *ArgsRec) { + + ArrayRef TArgs = ArgsRec->getTemplateArgs(); + + for (unsigned I = 0, E = Values.size(); I < E; ++I) { + RecordVal *Arg = ArgsRec->getValue(TArgs[I]); + RecTy *ArgType = Arg->getType(); + auto *Value = Values[I]; + + if (TypedInit *ArgValue = dyn_cast(Value)) { + auto *CastValue = ArgValue->getCastTo(ArgType); + if (CastValue) { + assert((!isa(CastValue) || + cast(CastValue)->getType()->typeIsA(ArgType)) && + "result of template arg value cast has wrong type"); + Values[I] = CastValue; + } else { + PrintFatalError(Loc, + "Value specified for template argument '" + + Arg->getNameInitAsString() + "' (#" + Twine(I) + + ") is of type " + ArgValue->getType()->getAsString() + + "; expected type " + ArgType->getAsString() + ": " + + ArgValue->getAsString()); + } + } + } + + return false; +} + // Check an assertion: Obtain the condition value and be sure it is true. // If not, print a nonfatal error along with the message. void TGParser::CheckAssert(SMLoc Loc, Init *Condition, Init *Message) { diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h index 578a56c9d01c..5b847ab7344f 100644 --- a/llvm/lib/TableGen/TGParser.h +++ b/llvm/lib/TableGen/TGParser.h @@ -243,8 +243,10 @@ private: // Parser methods. IDParseMode Mode = ParseValueMode); Init *ParseValue(Record *CurRec, RecTy *ItemType = nullptr, IDParseMode Mode = ParseValueMode); - void ParseValueList(SmallVectorImpl &Result, Record *CurRec, - Record *ArgsRec = nullptr, RecTy *EltTy = nullptr); + void ParseValueList(SmallVectorImpl &Result, + Record *CurRec, RecTy *ItemType = nullptr); + bool ParseTemplateArgValueList(SmallVectorImpl &Result, + Record *CurRec, Record *ArgsRec); void ParseDagArgList( SmallVectorImpl> &Result, Record *CurRec); @@ -264,6 +266,8 @@ private: // Parser methods. 
MultiClass *ParseMultiClassID(); bool ApplyLetStack(Record *CurRec); bool ApplyLetStack(RecordsEntry &Entry); + bool CheckTemplateArgValues(SmallVectorImpl &Values, + SMLoc Loc, Record *ArgsRec); void CheckAssert(SMLoc Loc, Init *Condition, Init *Message); void CheckRecordAsserts(Record &Rec); }; diff --git a/llvm/test/TableGen/self-reference-typeerror.td b/llvm/test/TableGen/self-reference-typeerror.td index 35c6131fa2c9..6f8da4dae135 100644 --- a/llvm/test/TableGen/self-reference-typeerror.td +++ b/llvm/test/TableGen/self-reference-typeerror.td @@ -1,13 +1,14 @@ // RUN: not llvm-tblgen %s 2>&1 | FileCheck %s // XFAIL: vg_leak -class A { - A a = x; +class Cl { + Cl Arec = rec; } // At the time A0 is referenced, A has not yet been established as a superclass. // This kind of self-reference is discourage, but if you *really* want it, you // can force it with !cast. // -// CHECK: Field 'A:x' of type 'A' is incompatible with value -def A0 : A; +// CHECK: alue specified for template argument 'Cl:rec' + +def Rec0 : Cl; diff --git a/llvm/test/TableGen/template-args.td b/llvm/test/TableGen/template-args.td new file mode 100644 index 000000000000..2a931adffe9a --- /dev/null +++ b/llvm/test/TableGen/template-args.td @@ -0,0 +1,142 @@ +// RUN: llvm-tblgen %s | FileCheck %s +// RUN: not llvm-tblgen -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s +// RUN: not llvm-tblgen -DERROR2 %s 2>&1 | FileCheck --check-prefix=ERROR2 %s +// RUN: not llvm-tblgen -DERROR3 %s 2>&1 | FileCheck --check-prefix=ERROR3 %s +// RUN: not llvm-tblgen -DERROR4 %s 2>&1 | FileCheck --check-prefix=ERROR4 %s +// RUN: not llvm-tblgen -DERROR5 %s 2>&1 | FileCheck --check-prefix=ERROR5 %s +// RUN: not llvm-tblgen -DERROR6 %s 2>&1 | FileCheck --check-prefix=ERROR6 %s + +// This file tests that template arguments are type-checked and cast +// if necessary. + +// Class template arguments. + +class Class1 { + string Name = nm; +} + +// CHECK: def Rec1 +// CHECK: string Name = "Alice" +// CHECK: string NameName = "AliceAlice" + +def Rec1 : Class1<"Alice"> { + string NameName = Name # Name; +} + +#ifdef ERROR1 +// ERROR1: Value specified for template argument 'Class1:nm' (#0) is of type int + +def Rec2 : Class1<42> { +} +#endif + +class Class2 cd> { + int Code = cd; +} + +// CHECK: def Rec3 +// CHECK: int Code = 42 +// CHECK: list CodeList = [42] + +def Rec3 : Class2<0b00101010> { + list CodeList = [Code]; +} + +// CHECK: def Rec4 +// CHECK: int Code = 42 +// CHECK: list CodeList = [42] + +def Rec4 : Class2<42> { + list CodeList = [Code]; +} + +#ifdef ERROR2 +// ERROR2: Value specified for template argument 'Class2:cd' (#0) is of type string + +def Rec5 : Class2<"oops"> { + list CodeList = [Code]; +} +#endif + +// Anonymous class instantiation template arguments. + +// CHECK: def Rec6 +// CHECK: string Name = "Ted" + +def Rec6 { + string Name = Class1<"Ted">.Name; +} + +#ifdef ERROR3 +// ERROR3: Value specified for template argument 'Class1:nm' (#0) is of type int + +def Rec7 { + string Name = Class1<42>.Name; +} +#endif + +// CHECK: def Rec8 +// CHECK: list CodeList = [42] + +def Rec8 { + list CodeList = [Class2<42>.Code]; +} + +#ifdef ERROR4 +// ERROR4: Value specified for template argument 'Class2:cd' (#0) is of type string + +def Rec9 { + list CodeList = [Class2<"huh?">.Code]; +} +#endif + +// Multiclass template arguments. 
+ +multiclass MC1 { + def _1 { + string Name = nm; + } + def _2 { + string NameNmae = nm # nm; + } +} + +// CHECK: def RecMC1_1 +// CHECK: string Name = "Carol" +// CHECK: def RecMC1_2 +// CHECK: string NameNmae = "CarolCarol" + +defm RecMC1 : MC1<"Carol">; + +#ifdef ERROR5 +// ERROR5: Value specified for template argument 'MC1::nm' (#0) is of type int + +defm RecMC2 : MC1<42>; +#endif + +multiclass MC2 cd> { + def _1 { + bits<8> Code = cd; + } + def _2 { + int Code = cd; + } + def _3 { + list CodeList = [cd]; + } +} + +// CHECK: def RecMC3_1 +// CHECK: bits<8> Code = { 0, 0, 1, 0, 1, 0, 1, 0 } +// CHECK: def RecMC3_2 +// CHECK: int Code = 42 +// CHECK: def RecMC3_3 +// CHECK: list CodeList = [42] + +defm RecMC3 : MC2<42>; + +#ifdef ERROR6 +// ERROR6: Value specified for template argument 'MC2::cd' (#0) is of type string + +defm RecMC4 : MC2<"Bob">; +#endif -- GitLab From aee005f9128adeda48c5f16d2cd04cde49b79105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 26 Feb 2021 15:10:53 +0200 Subject: [PATCH 0145/1000] [libcxx] [test] Fix windows errors in fs.op.rename Differential Revision: https://reviews.llvm.org/D98640 --- .../filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp index c1491581b11c..b6930d8d5fa9 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.rename/rename.pass.cpp @@ -62,7 +62,13 @@ TEST_CASE(test_error_reporting) } cases[] = { {dne, dne}, {file, dir}, - {dir, file} +#ifndef _WIN32 + // The spec doesn't say that this case must be an error; fs.op.rename + // note 1.2.1 says that a file may be overwritten by a rename. + // On Windows, with rename() implemented with MoveFileExW, overwriting + // a file with a directory is not an error. + {dir, file}, +#endif }; for (auto& TC : cases) { auto from_before = status(TC.from); -- GitLab From b982c6f5fa1bd8762554dbc79bf16b9449ca095a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 16 Mar 2021 14:37:28 +0200 Subject: [PATCH 0146/1000] [libcxx] [test] Avoid race conditions between tests regarding temp directories Prior to e0d01294bc124211a8ffb55e69162eb34a242680, all tests used a random directory name, but now it is deterministic, based on the test name. This change was done under the assumption that the filename portion of the cwd is unique across tests that use the filesystem test temporary directories. When running tests locally, the cwd of the test is something like "/test//Output/copy_assign.pass.cpp.dir", and the filename portion, "copy_assign.pass.cpp.dir", is used as base for the temp directory names. The change noted that there's a risk for race conditions if multiple threads within one test try to create temp directories in parallel, but that doesn't really happen in practice. However, if running tests with a large number of parallel workers, multiple tests with the same filename portion, e.g. "copy_assign.pass.cpp.dir", can run in parallel, leading to race conditions across processes. Therefore, add a hash of the full cwd to distinguish such cases from each other. Secondly, don't use two separate levels of temporary directories (/static_env.0). 
When cleaning up, only the individual directory is removed, leaving the empty intermediate directory behind littering the temp directory. Differential Revision: https://reviews.llvm.org/D98703 --- libcxx/test/support/filesystem_test_helper.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/libcxx/test/support/filesystem_test_helper.h b/libcxx/test/support/filesystem_test_helper.h index e1607fd61899..e87c43b9ea09 100644 --- a/libcxx/test/support/filesystem_test_helper.h +++ b/libcxx/test/support/filesystem_test_helper.h @@ -296,14 +296,17 @@ private: // sharing the same cwd). However, it is fairly unlikely to happen as // we generally don't use scoped_test_env from multiple threads, so // this is deemed acceptable. + // The cwd.filename() itself isn't unique across all tests in the suite, + // so start the numbering from a hash of the full cwd, to avoid + // different tests interfering with each other. static inline fs::path available_cwd_path() { fs::path const cwd = utils::getcwd(); fs::path const tmp = fs::temp_directory_path(); - fs::path const base = tmp / cwd.filename(); - int i = 0; - fs::path p = base / ("static_env." + std::to_string(i)); + std::string base = cwd.filename().string(); + size_t i = std::hash()(cwd.string()); + fs::path p = tmp / (base + "-static_env." + std::to_string(i)); while (utils::exists(p.string())) { - p = fs::path(base) / ("static_env." + std::to_string(++i)); + p = tmp / (base + "-static_env." + std::to_string(++i)); } return p; } -- GitLab From 2ec9239a7b1faf880a130d6e5146883b48c85681 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Wed, 14 Oct 2020 15:37:13 +0300 Subject: [PATCH 0147/1000] [libcxx] [test] Fix weakly_canonical for windows Differential Revision: https://reviews.llvm.org/D98643 --- .../weakly_canonical.pass.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp index 08d963fe6652..983ad7bf0137 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp @@ -27,6 +27,7 @@ int main(int, char**) { static_test_env static_env; + fs::path root = fs::current_path().root_path(); // clang-format off struct { fs::path input; @@ -34,10 +35,10 @@ int main(int, char**) { } TestCases[] = { {"", fs::current_path()}, {".", fs::current_path()}, - {"/", "/"}, - {"/foo", "/foo"}, - {"/.", "/"}, - {"/./", "/"}, + {"/", root}, + {"/foo", root / "foo"}, + {"/.", root}, + {"/./", root}, {"a/b", fs::current_path() / "a/b"}, {"a", fs::current_path() / "a"}, {"a/b/", fs::current_path() / "a/b/"}, @@ -61,15 +62,17 @@ int main(int, char**) { bool Failed = false; for (auto& TC : TestCases) { ++ID; - fs::path p(TC.input); + fs::path p = TC.input; + fs::path expect = TC.expect; + expect.make_preferred(); const fs::path output = fs::weakly_canonical(p); - if (!PathEq(output, TC.expect)) { + if (!PathEq(output, expect)) { Failed = true; std::fprintf(stderr, "TEST CASE #%d FAILED:\n" " Input: '%s'\n" " Expected: '%s'\n" " Output: '%s'\n", - ID, TC.input.string().c_str(), TC.expect.string().c_str(), + ID, TC.input.string().c_str(), expect.string().c_str(), output.string().c_str()); } } -- GitLab From 
ffb28871037105c899f63726953b6c4e7aa7b148 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Mar 2021 13:34:47 +0000 Subject: [PATCH 0148/1000] [DAG] Fold shuffle(bop(shuffle(x,y),shuffle(z,w)),undef) -> bop(shuffle'(x,y),shuffle'(z,w)) Followup to D96345, handle unary shuffles of binops (as well as binary shuffles) if we can merge the shuffle with inner operand shuffles. Differential Revision: https://reviews.llvm.org/D98646 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +++++---- llvm/test/CodeGen/X86/haddsub-4.ll | 21 ++++++++-------- llvm/test/CodeGen/X86/haddsub-shuf.ll | 4 ---- .../test/CodeGen/X86/known-signbits-vector.ll | 24 +++++++------------ 4 files changed, 25 insertions(+), 35 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 382fc91285a0..16833c5977d7 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -21255,14 +21255,17 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { // Merge shuffles through binops if we are able to merge it with at least // one other shuffles. + // shuffle(bop(shuffle(x,y),shuffle(z,w)),undef) // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d))) unsigned SrcOpcode = N0.getOpcode(); - if (SrcOpcode == N1.getOpcode() && TLI.isBinOp(SrcOpcode) && - N->isOnlyUserOf(N0.getNode()) && N->isOnlyUserOf(N1.getNode())) { + if (TLI.isBinOp(SrcOpcode) && N->isOnlyUserOf(N0.getNode()) && + (N1.isUndef() || + (SrcOpcode == N1.getOpcode() && N->isOnlyUserOf(N1.getNode())))) { + // Get binop source ops, or just pass on the undef. SDValue Op00 = N0.getOperand(0); - SDValue Op10 = N1.getOperand(0); SDValue Op01 = N0.getOperand(1); - SDValue Op11 = N1.getOperand(1); + SDValue Op10 = N1.isUndef() ? N1 : N1.getOperand(0); + SDValue Op11 = N1.isUndef() ? N1 : N1.getOperand(1); // TODO: We might be able to relax the VT check but we don't currently // have any isBinOp() that has different result/ops VTs so play safe until // we have test coverage. 
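The legality of this combine can be sanity-checked with a scalar model, independent of SelectionDAG: permuting the result of an elementwise binop produces the same lanes as applying the binop to identically permuted inputs. A minimal standalone C++ sketch of that identity follows; the helper names `shuffle` and `add`, and the fixed width of 4 lanes, are illustrative only and not part of the patch.

    #include <array>
    #include <cassert>

    // Scalar model: apply a shuffle mask to a 4-lane vector.
    static std::array<int, 4> shuffle(const std::array<int, 4> &v,
                                      const std::array<int, 4> &mask) {
      std::array<int, 4> out{};
      for (int i = 0; i != 4; ++i)
        out[i] = v[mask[i]];
      return out;
    }

    // Scalar model: any elementwise binop; addition stands in for all of them.
    static std::array<int, 4> add(const std::array<int, 4> &a,
                                  const std::array<int, 4> &b) {
      std::array<int, 4> out{};
      for (int i = 0; i != 4; ++i)
        out[i] = a[i] + b[i];
      return out;
    }

    int main() {
      const std::array<int, 4> x{1, 2, 3, 4}, y{10, 20, 30, 40}, m{3, 2, 1, 0};
      // Moving the shuffle below the binop leaves every lane unchanged,
      // which is what lets the combine merge the outer shuffle with the
      // inner operand shuffles.
      assert(shuffle(add(x, y), m) == add(shuffle(x, m), shuffle(y, m)));
      return 0;
    }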
diff --git a/llvm/test/CodeGen/X86/haddsub-4.ll b/llvm/test/CodeGen/X86/haddsub-4.ll index 2e077d6247ba..3784400e3086 100644 --- a/llvm/test/CodeGen/X86/haddsub-4.ll +++ b/llvm/test/CodeGen/X86/haddsub-4.ll @@ -123,26 +123,25 @@ define <8 x float> @hadd_reverse2_v8f32(<8 x float> %a0, <8 x float> %a1) { define <8 x float> @hadd_reverse3_v8f32(<8 x float> %a0, <8 x float> %a1) { ; SSE-LABEL: hadd_reverse3_v8f32: ; SSE: # %bb.0: -; SSE-NEXT: movaps %xmm0, %xmm4 -; SSE-NEXT: haddps %xmm2, %xmm4 -; SSE-NEXT: haddps %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2,1,0] -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,2,1,0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: movaps %xmm4, %xmm1 +; SSE-NEXT: haddps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0,3,2] +; SSE-NEXT: haddps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0,3,2] +; SSE-NEXT: movaps %xmm3, %xmm0 +; SSE-NEXT: movaps %xmm2, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: hadd_reverse3_v8f32: ; AVX1: # %bb.0: -; AVX1-NEXT: vhaddps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vhaddps %ymm0, %ymm1, %ymm0 ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] -; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_reverse3_v8f32: ; AVX2: # %bb.0: -; AVX2-NEXT: vhaddps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4] +; AVX2-NEXT: vhaddps %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6] ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-NEXT: retq %shuf0 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> diff --git a/llvm/test/CodeGen/X86/haddsub-shuf.ll b/llvm/test/CodeGen/X86/haddsub-shuf.ll index 22007df8320a..429175a10818 100644 --- a/llvm/test/CodeGen/X86/haddsub-shuf.ll +++ b/llvm/test/CodeGen/X86/haddsub-shuf.ll @@ -525,7 +525,6 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphaddd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_v8i32b: @@ -615,7 +614,6 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphsubd %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hsub_v8i32b: @@ -705,7 +703,6 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphaddw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hadd_v16i16b: @@ -795,7 +792,6 @@ define <16 x i16> @hsub_v16i16b(<16 x i16> %a) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vphsubw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-LABEL: hsub_v16i16b: diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll index 18cd42c8c1de..bed0abf5a26b 100644 --- a/llvm/test/CodeGen/X86/known-signbits-vector.ll +++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll @@ -513,9 +513,8 @@ define <4 x i32> @signbits_mask_ashr_smax(<4 x i32> %a0, <4 x i32> %a1) { ; X86: # %bb.0: ; X86-NEXT: vpsrad $25, %xmm0, %xmm0 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1 -; X86-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-NEXT: retl ; @@ -523,9 +522,8 @@ define <4 x i32> @signbits_mask_ashr_smax(<4 x i32> %a0, <4 x i32> %a1) { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; @@ -553,9 +551,8 @@ define <4 x i32> @signbits_mask_ashr_smin(<4 x i32> %a0, <4 x i32> %a1) { ; X86: # %bb.0: ; X86-NEXT: vpsrad $25, %xmm0, %xmm0 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-NEXT: retl ; @@ -563,9 +560,8 @@ define <4 x i32> @signbits_mask_ashr_smin(<4 x i32> %a0, <4 x i32> %a1) { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; @@ -593,9 +589,8 @@ define <4 x i32> @signbits_mask_ashr_umax(<4 x i32> %a0, <4 x i32> %a1) { ; X86: # %bb.0: ; X86-NEXT: vpsrad $25, %xmm0, %xmm0 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-NEXT: retl ; @@ -603,9 +598,8 @@ define <4 x i32> @signbits_mask_ashr_umax(<4 x i32> %a0, <4 x i32> %a1) { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; @@ -633,9 +627,8 @@ define <4 x i32> @signbits_mask_ashr_umin(<4 x i32> %a0, <4 x i32> %a1) { ; X86: # %bb.0: ; X86-NEXT: vpsrad $25, %xmm0, %xmm0 ; X86-NEXT: vpsrad $25, %xmm1, %xmm1 -; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-NEXT: vpand {{\.LCPI.*}}, %xmm0, %xmm0 ; X86-NEXT: retl ; @@ -643,9 +636,8 @@ define <4 x i32> @signbits_mask_ashr_umin(<4 x i32> %a0, <4 x i32> %a1) { ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vpsrad $25, %xmm0, %xmm0 ; X64-AVX1-NEXT: vpsrad $25, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vpminud %xmm1, %xmm0, %xmm0 +; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; X64-AVX1-NEXT: retq ; -- GitLab From 
72557476d459969dbee95252e73f6ff1dfcc46c5 Mon Sep 17 00:00:00 2001 From: Arthur O'Dwyer Date: Tue, 16 Mar 2021 12:51:24 -0400 Subject: [PATCH 0149/1000] [libc++] Consistency on _LIBCPP_CLANG_VER tests in . This came out of my review comments on D97283. This patch re-enables the use of `__is_fundamental`, `__is_signed`, etc. on non-Clang compilers. Previously, when we found that a builtin didn't work on old Clangs, we had been reacting by limiting its use to new Clangs (i.e., we'd also stop using it on new GCCs and new MSVCs, just because of the old Clang bug). I claim that this was unintentional. Notice that on Apple Clang, `_LIBCPP_COMPILER_CLANG` is defined and `_LIBCPP_CLANG_VER` is not defined (therefore `0` in arithmetic expressions). We assume that Apple Clang has all the bugs of all the Clangs. Differential Revision: https://reviews.llvm.org/D98720 --- libcxx/include/type_traits | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index 7477e6d143de..d028ca22fac0 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -834,8 +834,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_array_v // is_pointer -// In clang 10.0.0 and earlier __is_pointer didn't work with Objective-C types. -#if __has_keyword(__is_pointer) && _LIBCPP_CLANG_VER > 1000 +// Before Clang 11, __is_pointer didn't work for Objective-C types. +#if __has_keyword(__is_pointer) && !(defined(_LIBCPP_COMPILER_CLANG) && _LIBCPP_CLANG_VER < 1100) template struct _LIBCPP_TEMPLATE_VIS is_pointer : _BoolConstant<__is_pointer(_Tp)> { }; @@ -1129,9 +1129,9 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_arithmetic_v // is_fundamental -// In clang 9 and lower, this builtin did not work for nullptr_t. Additionally, in C++03 mode, -// nullptr isn't defined by the compiler so, this builtin won't work. -#if __has_keyword(__is_fundamental) && _LIBCPP_CLANG_VER > 900 && !defined(_LIBCPP_CXX03_LANG) +// Before Clang 10, __is_fundamental didn't work for nullptr_t. +// In C++03 nullptr_t is library-provided but must still count as "fundamental." +#if __has_keyword(__is_fundamental) && !(defined(_LIBCPP_COMPILER_CLANG) && _LIBCPP_CLANG_VER < 1000) && !defined(_LIBCPP_CXX03_LANG) template struct _LIBCPP_TEMPLATE_VIS is_fundamental : _BoolConstant<__is_fundamental(_Tp)> { }; @@ -1158,7 +1158,7 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_fundamental_v // is_scalar -// >= 11 because in C++03 nullptr isn't actually nullptr +// In C++03 nullptr_t is library-provided but must still count as "scalar." #if __has_keyword(__is_scalar) && !defined(_LIBCPP_CXX03_LANG) template @@ -1415,8 +1415,8 @@ template using type_identity_t = typename type_identity<_Tp>::type; // is_signed -// In clang 9 and earlier, this builtin did not work for floating points or enums -#if __has_keyword(__is_signed) && _LIBCPP_CLANG_VER > 900 +// Before Clang 10, __is_signed didn't work for floating-point types or enums. +#if __has_keyword(__is_signed) && !(defined(_LIBCPP_COMPILER_CLANG) && _LIBCPP_CLANG_VER < 1000) template struct _LIBCPP_TEMPLATE_VIS is_signed : _BoolConstant<__is_signed(_Tp)> { }; @@ -1451,8 +1451,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_signed_v // is_unsigned -// Before clang 13, __is_unsigned returned true for enums with signed underlying type -#if __has_keyword(__is_unsigned) && _LIBCPP_CLANG_VER >= 1300 +// Before Clang 13, __is_unsigned returned true for enums with signed underlying type. 
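// Standalone illustration, not part of this hunk: the standard-mandated
// behavior the guard below preserves is that is_unsigned is false for
// *every* enumeration type, whatever its underlying type, because enums
// are not arithmetic types. A minimal compilable check:
#include <type_traits>
enum SignedE : int { SE };            // signed underlying type
enum UnsignedE : unsigned { UE };     // unsigned underlying type
static_assert(!std::is_unsigned<SignedE>::value, "enum types are never unsigned");
static_assert(!std::is_unsigned<UnsignedE>::value, "enum types are never unsigned");
static_assert(std::is_unsigned<unsigned int>::value, "plain unsigned types still are");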
+#if __has_keyword(__is_unsigned) && !(defined(_LIBCPP_COMPILER_CLANG) && _LIBCPP_CLANG_VER < 1300)
 
 template <class _Tp>
 struct _LIBCPP_TEMPLATE_VIS is_unsigned : _BoolConstant<__is_unsigned(_Tp)> { };
 
@@ -1462,7 +1462,7 @@ template <class _Tp>
 _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_unsigned_v = __is_unsigned(_Tp);
 #endif
 
-#else // __has_keyword(__is_unsigned) && _LIBCPP_CLANG_VER >= 1300
+#else // __has_keyword(__is_unsigned)
 
 template <class _Tp, bool = is_integral<_Tp>::value>
 struct __libcpp_is_unsigned_impl : public _LIBCPP_BOOL_CONSTANT(_Tp(0) < _Tp(-1)) {};
 
@@ -1483,7 +1483,7 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_unsigned_v
     = is_unsigned<_Tp>::value;
 #endif
 
-#endif // __has_keyword(__is_unsigned) && _LIBCPP_CLANG_VER >= 1300
+#endif // __has_keyword(__is_unsigned)
 
 // rank
 
-- 
GitLab


From 4532ab76c9e8577bb5b6697eca22d9a21b89304f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=BChnel?=
Date: Wed, 24 Feb 2021 15:50:02 +0100
Subject: [PATCH 0150/1000] propose Chocolatey as package manager

Installing the Unix tools on Windows is quite painful. To make things
easier, I explained how to use a package manager or a Docker image.

Note: This still uses the GNUWin tools as explained on this page. Once
we replace these with something else, we would also need to update the
installation commands.

Differential Revision: https://reviews.llvm.org/D97387
---
 llvm/docs/GettingStartedVS.rst | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/llvm/docs/GettingStartedVS.rst b/llvm/docs/GettingStartedVS.rst
index 2ed4397ac39f..2178c41824ea 100644
--- a/llvm/docs/GettingStartedVS.rst
+++ b/llvm/docs/GettingStartedVS.rst
@@ -57,6 +57,20 @@ need `GnuWin32 `_ tools, too.
 Do not install the LLVM directory tree into a path containing spaces (e.g.
 ``C:\Documents and Settings\...``) as the configure step will fail.
 
+To simplify the installation procedure, you can also use
+`Chocolatey `_ as package manager. After the
+`installation `_ of Chocolatey, run these
+commands in an admin shell to install the required tools:
+
+.. code-block:: bat
+
+  choco install -y ninja git cmake gnuwin python3
+  pip3 install psutil
+
+There is also a Windows
+`Dockerfile `_
+with the entire build tool chain. This can be used to test the build with a
+tool chain different from your host installation or to create build servers.
 
 Getting Started
 ===============
-- 
GitLab


From 96e675bdd5c8bfef34135fb50bcc7f570f073639 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= <1.int32@gmail.com>
Date: Fri, 19 Mar 2021 15:46:20 +0100
Subject: [PATCH 0151/1000] [clang][ASTImporter] Add import support for
 SourceLocExpr.

Imported `SourceLocExpr` nodes can cause unexpected behavior (for
example, if `__builtin_LINE()` is used together with `__LINE__`), but
it may still be worth importing these because some projects use them.
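As a minimal sketch of the interaction (the function names are illustrative; `__builtin_LINE` is the Clang builtin behind `SourceLocExpr`): within one translation unit the two agree, but after import the macro result stays the integer literal baked in at parse time, while the `SourceLocExpr` is evaluated against its imported source location, so the two can diverge.

    #include <cstdio>
    int fromMacro() { return __LINE__; }           // expanded by the preprocessor to an integer literal
    int fromBuiltin() { return __builtin_LINE(); } // kept in the AST as a SourceLocExpr node
    int main() { std::printf("%d %d\n", fromMacro(), fromBuiltin()); }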
Reviewed By: teemperor

Differential Revision: https://reviews.llvm.org/D98876
---
 clang/lib/AST/ASTImporter.cpp           | 16 ++++++++++++++++
 clang/unittests/AST/ASTImporterTest.cpp | 18 ++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index d48e173eb3b3..bf3cb4c42873 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -574,6 +574,7 @@ namespace clang {
     // Importing expressions
     ExpectedStmt VisitExpr(Expr *E);
+    ExpectedStmt VisitSourceLocExpr(SourceLocExpr *E);
     ExpectedStmt VisitVAArgExpr(VAArgExpr *E);
     ExpectedStmt VisitChooseExpr(ChooseExpr *E);
     ExpectedStmt VisitGNUNullExpr(GNUNullExpr *E);
@@ -6483,6 +6484,21 @@ ExpectedStmt ASTNodeImporter::VisitExpr(Expr *E) {
   return make_error<ImportError>(ImportError::UnsupportedConstruct);
 }
 
+ExpectedStmt ASTNodeImporter::VisitSourceLocExpr(SourceLocExpr *E) {
+  Error Err = Error::success();
+  auto BLoc = importChecked(Err, E->getBeginLoc());
+  auto RParenLoc = importChecked(Err, E->getEndLoc());
+  if (Err)
+    return std::move(Err);
+  auto ParentContextOrErr = Importer.ImportContext(E->getParentContext());
+  if (!ParentContextOrErr)
+    return ParentContextOrErr.takeError();
+
+  return new (Importer.getToContext())
+      SourceLocExpr(Importer.getToContext(), E->getIdentKind(), BLoc, RParenLoc,
+                    *ParentContextOrErr);
+}
+
 ExpectedStmt ASTNodeImporter::VisitVAArgExpr(VAArgExpr *E) {
   Error Err = Error::success();
 
diff --git a/clang/unittests/AST/ASTImporterTest.cpp b/clang/unittests/AST/ASTImporterTest.cpp
index 43464cc0c9ca..8c4b982ec6d5 100644
--- a/clang/unittests/AST/ASTImporterTest.cpp
+++ b/clang/unittests/AST/ASTImporterTest.cpp
@@ -246,6 +246,24 @@ TEST_P(ImportPath, CycleAfterCycle) {
   EXPECT_FALSE(path.hasCycleAtBack());
 }
 
+const internal::VariadicDynCastAllOfMatcher<Expr, SourceLocExpr> sourceLocExpr;
+
+AST_MATCHER_P(SourceLocExpr, hasBuiltinStr, StringRef, Str) {
+  return Node.getBuiltinStr() == Str;
+}
+
+TEST_P(ImportExpr, ImportSourceLocExpr) {
+  MatchVerifier<Decl> Verifier;
+  testImport("void declToImport() { (void)__builtin_FILE(); }", Lang_CXX03, "",
+             Lang_CXX03, Verifier,
+             functionDecl(hasDescendant(
+                 sourceLocExpr(hasBuiltinStr("__builtin_FILE")))));
+  testImport("void declToImport() { (void)__builtin_COLUMN(); }", Lang_CXX03,
+             "", Lang_CXX03, Verifier,
+             functionDecl(hasDescendant(
+                 sourceLocExpr(hasBuiltinStr("__builtin_COLUMN")))));
+}
+
 TEST_P(ImportExpr, ImportStringLiteral) {
   MatchVerifier<Decl> Verifier;
   testImport("void declToImport() { (void)\"foo\"; }", Lang_CXX03, "",
-- 
GitLab


From 57effe22050f48a490606d83daf07560948ece4c Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Fri, 19 Mar 2021 08:36:03 -0700
Subject: [PATCH 0152/1000] [AMDGPU] Remove dead glc1 handling in asm parser.
 NFC.

---
 llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 8a8831f22ff1..d9ce76c49e34 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -338,9 +338,6 @@ public:
   bool isGDS() const { return isImmTy(ImmTyGDS); }
   bool isLDS() const { return isImmTy(ImmTyLDS); }
   bool isCPol() const { return isImmTy(ImmTyCPol); }
-  // "CPol_GLC1" is a MatchClass of the CPOL_GLC1 operand with the default and
-  // forced value of the GLC operand.
- bool isCPol_GLC1() const { return isImmTy(ImmTyCPol); } bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } @@ -1620,7 +1617,6 @@ public: void cvtMtbuf(MCInst &Inst, const OperandVector &Operands); AMDGPUOperand::Ptr defaultCPol() const; - AMDGPUOperand::Ptr defaultCPol_GLC1() const; AMDGPUOperand::Ptr defaultSMRDOffset8() const; AMDGPUOperand::Ptr defaultSMEMOffset() const; @@ -6905,11 +6901,6 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCPol() const { return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCPol); } -AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCPol_GLC1() const { - return AMDGPUOperand::CreateImm(this, CPol::GLC, SMLoc(), - AMDGPUOperand::ImmTyCPol); -} - void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, -- GitLab From b8616e40daf7a4c910f5fc0201c7ddd64082aaf0 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 18 Mar 2021 17:00:14 +0000 Subject: [PATCH 0153/1000] [AMDGPU] Add atomic optimizer nouse tests Add some atomic optimizer tests where there is no use of the result of the atomic operation, which is a common case in real code. NFC. Differential Revision: https://reviews.llvm.org/D98952 --- .../atomic_optimizations_local_pointer.ll | 640 +++++++++++++----- 1 file changed, 476 insertions(+), 164 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index b797e3efc373..f3de201745d0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -575,6 +575,162 @@ entry: ret void } +define amdgpu_kernel void @add_i32_varying_nouse() { +; GFX7LESS-LABEL: add_i32_varying_nouse: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: ds_add_u32 v1, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: add_i32_varying_nouse: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8-NEXT: s_mov_b64 exec, s[0:1] +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz BB3_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: 
ds_add_u32 v0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: BB3_2: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: add_i32_varying_nouse: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz BB3_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_add_u32 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: BB3_2: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: add_i32_varying_nouse: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GFX1064-NEXT: s_mov_b32 s0, s2 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz BB3_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: ds_add_u32 v0, v3 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: BB3_2: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: add_i32_varying_nouse: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s1, v1, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s0, s1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB3_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: ds_add_u32 v0, v3 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: BB3_2: +; GFX1032-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw add i32 addrspace(3)* @local_var32, i32 %lane acq_rel + ret void +} + define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; ; @@ -587,7 +743,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB3_2 +; GFX7LESS-NEXT: s_cbranch_execz BB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -598,7 +754,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB3_2: +; GFX7LESS-NEXT: BB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -622,7 +778,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB3_2 +; GFX8-NEXT: s_cbranch_execz BB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -633,7 +789,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB3_2: +; GFX8-NEXT: BB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -656,7 +812,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB3_2 +; GFX9-NEXT: s_cbranch_execz BB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -666,7 +822,7 
@@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB3_2: +; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -689,7 +845,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB3_2 +; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -701,7 +857,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB3_2: +; GFX1064-NEXT: BB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -721,7 +877,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB3_2 +; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -733,7 +889,7 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB3_2: +; GFX1032-NEXT: BB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -762,7 +918,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB4_2 +; GFX7LESS-NEXT: s_cbranch_execz BB5_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -777,7 +933,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB4_2: +; GFX7LESS-NEXT: BB5_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -805,7 +961,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB4_2 +; GFX8-NEXT: s_cbranch_execz BB5_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 @@ -820,7 +976,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB4_2: +; GFX8-NEXT: BB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX8-NEXT: s_mov_b32 s4, s0 @@ -848,7 +1004,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB4_2 +; GFX9-NEXT: s_cbranch_execz BB5_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -862,7 +1018,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB4_2: +; GFX9-NEXT: BB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -890,7 +1046,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB4_2 +; GFX1064-NEXT: s_cbranch_execz BB5_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -906,7 +1062,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB4_2: +; GFX1064-NEXT: BB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -931,7 +1087,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB4_2 +; GFX1032-NEXT: s_cbranch_execz BB5_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -947,7 +1103,7 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 %additive ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB4_2: +; GFX1032-NEXT: BB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1062,7 +1218,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB6_2 +; GFX7LESS-NEXT: s_cbranch_execz BB7_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 @@ -1072,7 +1228,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB6_2: +; GFX7LESS-NEXT: BB7_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -1092,7 +1248,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB6_2 +; GFX8-NEXT: s_cbranch_execz BB7_2 ; 
GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX8-NEXT: s_mul_i32 s2, s2, 5 @@ -1102,7 +1258,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB6_2: +; GFX8-NEXT: BB7_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 @@ -1123,7 +1279,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB6_2 +; GFX9-NEXT: s_cbranch_execz BB7_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: s_mul_i32 s2, s2, 5 @@ -1132,7 +1288,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB6_2: +; GFX9-NEXT: BB7_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 @@ -1153,7 +1309,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s3, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB6_2 +; GFX1064-NEXT: s_cbranch_execz BB7_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1164,7 +1320,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB6_2: +; GFX1064-NEXT: BB7_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1184,7 +1340,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB6_2 +; GFX1032-NEXT: s_cbranch_execz BB7_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1195,7 +1351,7 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB6_2: +; GFX1032-NEXT: BB7_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -1225,7 +1381,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB7_2 +; GFX7LESS-NEXT: s_cbranch_execz BB8_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) @@ -1236,7 +1392,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB7_2: +; GFX7LESS-NEXT: BB8_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] ; 
GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 @@ -1257,7 +1413,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc -; GFX8-NEXT: s_cbranch_execz BB7_2 +; GFX8-NEXT: s_cbranch_execz BB8_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1268,7 +1424,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB7_2: +; GFX8-NEXT: BB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1289,7 +1445,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX9-NEXT: s_cbranch_execz BB7_2 +; GFX9-NEXT: s_cbranch_execz BB8_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1299,7 +1455,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB7_2: +; GFX9-NEXT: BB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s2, v0 @@ -1321,7 +1477,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GFX1064-NEXT: s_cbranch_execz BB7_2 +; GFX1064-NEXT: s_cbranch_execz BB8_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1333,7 +1489,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB7_2: +; GFX1064-NEXT: BB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1355,7 +1511,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB7_2 +; GFX1032-NEXT: s_cbranch_execz BB8_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 ; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1367,7 +1523,7 @@ define amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 %subitive ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB7_2: +; GFX1032-NEXT: BB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1431,7 +1587,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB8_2 +; GFX8-NEXT: s_cbranch_execz BB9_2 ; GFX8-NEXT: ; %bb.1: ; 
GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -1439,7 +1595,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB8_2: +; GFX8-NEXT: BB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -1482,14 +1638,14 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB8_2 +; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB8_2: +; GFX9-NEXT: BB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -1541,7 +1697,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB8_2 +; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -1551,7 +1707,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB8_2: +; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -1592,7 +1748,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB8_2 +; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -1601,7 +1757,7 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB8_2: +; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -1618,6 +1774,162 @@ entry: ret void } +define amdgpu_kernel void @sub_i32_varying_nouse() { +; GFX7LESS-LABEL: sub_i32_varying_nouse: +; GFX7LESS: ; %bb.0: ; %entry +; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo +; GFX7LESS-NEXT: s_mov_b32 m0, -1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: ds_sub_u32 v1, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: sub_i32_varying_nouse: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_not_b64 exec, exec +; GFX8-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp 
v1, vcc, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: v_add_u32_dpp v1, vcc, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX8-NEXT: v_readlane_b32 s2, v1, 63 +; GFX8-NEXT: s_mov_b64 exec, s[0:1] +; GFX8-NEXT: s_mov_b32 s0, s2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz BB10_2 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_mov_b32 m0, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: ds_sub_u32 v0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: BB10_2: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: sub_i32_varying_nouse: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_not_b64 exec, exec +; GFX9-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-NEXT: v_readlane_b32 s2, v1, 63 +; GFX9-NEXT: s_mov_b64 exec, s[0:1] +; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz BB10_2 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: ds_sub_u32 v0, v2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: BB10_2: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: sub_i32_varying_nouse: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s2, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s2 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GFX1064-NEXT: v_readlane_b32 s2, v1, 63 +; GFX1064-NEXT: s_mov_b64 exec, s[0:1] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GFX1064-NEXT: s_mov_b32 s0, s2 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz BB10_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1064-NEXT: v_mov_b32_e32 v3, s0 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: ds_sub_u32 v0, v3 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: BB10_2: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: sub_i32_varying_nouse: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: s_or_saveexec_b32 s0, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_readlane_b32 s1, v1, 31 +; GFX1032-NEXT: s_mov_b32 exec_lo, s0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s0, s1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB10_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo +; GFX1032-NEXT: v_mov_b32_e32 v3, s0 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: ds_sub_u32 v0, v3 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: BB10_2: +; GFX1032-NEXT: s_endpgm +entry: + %lane = call i32 @llvm.amdgcn.workitem.id.x() + %old = atomicrmw sub i32 addrspace(3)* @local_var32, i32 %lane acq_rel + ret void +} + define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; ; @@ -1630,7 +1942,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB9_2 +; GFX7LESS-NEXT: s_cbranch_execz BB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1641,7 +1953,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB9_2: +; GFX7LESS-NEXT: BB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 @@ -1665,7 +1977,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: 
s_cbranch_execz BB9_2 +; GFX8-NEXT: s_cbranch_execz BB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -1676,7 +1988,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB9_2: +; GFX8-NEXT: BB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 @@ -1700,7 +2012,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB9_2 +; GFX9-NEXT: s_cbranch_execz BB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 @@ -1710,7 +2022,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB9_2: +; GFX9-NEXT: BB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s3, v2 @@ -1734,7 +2046,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s5, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB9_2 +; GFX1064-NEXT: s_cbranch_execz BB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1746,7 +2058,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB9_2: +; GFX1064-NEXT: BB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 @@ -1769,7 +2081,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB9_2 +; GFX1032-NEXT: s_cbranch_execz BB11_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1781,7 +2093,7 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB9_2: +; GFX1032-NEXT: BB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 @@ -1813,7 +2125,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB10_2 +; GFX7LESS-NEXT: s_cbranch_execz BB12_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1828,7 +2140,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: 
ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB10_2: +; GFX7LESS-NEXT: BB12_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s6, -1 @@ -1856,7 +2168,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX8-NEXT: s_cbranch_execz BB10_2 +; GFX8-NEXT: s_cbranch_execz BB12_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 @@ -1871,7 +2183,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB10_2: +; GFX8-NEXT: BB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s4, s0 @@ -1899,7 +2211,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz BB10_2 +; GFX9-NEXT: s_cbranch_execz BB12_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1913,7 +2225,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB10_2: +; GFX9-NEXT: BB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -1941,7 +2253,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB10_2 +; GFX1064-NEXT: s_cbranch_execz BB12_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1957,7 +2269,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB10_2: +; GFX1064-NEXT: BB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1982,7 +2294,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB10_2 +; GFX1032-NEXT: s_cbranch_execz BB12_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo @@ -1998,7 +2310,7 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 %subitive ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB10_2: +; GFX1032-NEXT: BB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -2148,7 +2460,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB12_2 +; GFX8-NEXT: s_cbranch_execz BB14_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2156,7 +2468,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB12_2: +; GFX8-NEXT: BB14_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2199,14 +2511,14 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB12_2 +; GFX9-NEXT: s_cbranch_execz BB14_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB12_2: +; GFX9-NEXT: BB14_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2258,7 +2570,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB12_2 +; GFX1064-NEXT: s_cbranch_execz BB14_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2268,7 +2580,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB12_2: +; GFX1064-NEXT: BB14_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -2309,7 +2621,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB12_2 +; GFX1032-NEXT: s_cbranch_execz BB14_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -2318,7 +2630,7 @@ define amdgpu_kernel void @and_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB12_2: +; GFX1032-NEXT: BB14_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -2382,7 +2694,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB13_2 +; GFX8-NEXT: s_cbranch_execz BB15_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2390,7 +2702,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB13_2: +; GFX8-NEXT: BB15_2: ; GFX8-NEXT: s_or_b64 exec, 
exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2433,14 +2745,14 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB13_2 +; GFX9-NEXT: s_cbranch_execz BB15_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB13_2: +; GFX9-NEXT: BB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2492,7 +2804,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB13_2 +; GFX1064-NEXT: s_cbranch_execz BB15_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2502,7 +2814,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB13_2: +; GFX1064-NEXT: BB15_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -2543,7 +2855,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB13_2 +; GFX1032-NEXT: s_cbranch_execz BB15_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -2552,7 +2864,7 @@ define amdgpu_kernel void @or_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB13_2: +; GFX1032-NEXT: BB15_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -2616,7 +2928,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB14_2 +; GFX8-NEXT: s_cbranch_execz BB16_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2624,7 +2936,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB14_2: +; GFX8-NEXT: BB16_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2667,14 +2979,14 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB14_2 +; GFX9-NEXT: s_cbranch_execz BB16_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: BB14_2: +; GFX9-NEXT: BB16_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2726,7 +3038,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB14_2 +; GFX1064-NEXT: s_cbranch_execz BB16_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -2736,7 +3048,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB14_2: +; GFX1064-NEXT: BB16_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -2777,7 +3089,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB14_2 +; GFX1032-NEXT: s_cbranch_execz BB16_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -2786,7 +3098,7 @@ define amdgpu_kernel void @xor_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB14_2: +; GFX1032-NEXT: BB16_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -2850,7 +3162,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB15_2 +; GFX8-NEXT: s_cbranch_execz BB17_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -2858,7 +3170,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB15_2: +; GFX8-NEXT: BB17_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -2901,14 +3213,14 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB15_2 +; GFX9-NEXT: s_cbranch_execz BB17_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB15_2: +; GFX9-NEXT: BB17_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -2962,7 +3274,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB15_2 +; GFX1064-NEXT: s_cbranch_execz BB17_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 
v4, s7 @@ -2972,7 +3284,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB15_2: +; GFX1064-NEXT: BB17_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3015,7 +3327,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB15_2 +; GFX1032-NEXT: s_cbranch_execz BB17_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3024,7 +3336,7 @@ define amdgpu_kernel void @max_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB15_2: +; GFX1032-NEXT: BB17_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3052,7 +3364,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB16_2 +; GFX7LESS-NEXT: s_cbranch_execz BB18_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -3061,7 +3373,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB16_2: +; GFX7LESS-NEXT: BB18_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -3087,7 +3399,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB16_2 +; GFX8-NEXT: s_cbranch_execz BB18_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3096,7 +3408,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB16_2: +; GFX8-NEXT: BB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3122,7 +3434,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB16_2 +; GFX9-NEXT: s_cbranch_execz BB18_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3130,7 +3442,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB16_2: +; GFX9-NEXT: BB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3156,7 +3468,7 @@ 
define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB16_2 +; GFX1064-NEXT: s_cbranch_execz BB18_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3166,7 +3478,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB16_2: +; GFX1064-NEXT: BB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -3189,7 +3501,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB16_2 +; GFX1032-NEXT: s_cbranch_execz BB18_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3199,7 +3511,7 @@ define amdgpu_kernel void @max_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB16_2: +; GFX1032-NEXT: BB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -3267,7 +3579,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB17_2 +; GFX8-NEXT: s_cbranch_execz BB19_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -3275,7 +3587,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB17_2: +; GFX8-NEXT: BB19_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3318,14 +3630,14 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB17_2 +; GFX9-NEXT: s_cbranch_execz BB19_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB17_2: +; GFX9-NEXT: BB19_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3379,7 +3691,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB17_2 +; GFX1064-NEXT: s_cbranch_execz BB19_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -3389,7 +3701,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, 
v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB17_2: +; GFX1064-NEXT: BB19_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3432,7 +3744,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB17_2 +; GFX1032-NEXT: s_cbranch_execz BB19_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3441,7 +3753,7 @@ define amdgpu_kernel void @min_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB17_2: +; GFX1032-NEXT: BB19_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3469,7 +3781,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB18_2 +; GFX7LESS-NEXT: s_cbranch_execz BB20_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -3478,7 +3790,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB18_2: +; GFX7LESS-NEXT: BB20_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -3504,7 +3816,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB18_2 +; GFX8-NEXT: s_cbranch_execz BB20_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3513,7 +3825,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB18_2: +; GFX8-NEXT: BB20_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 @@ -3539,7 +3851,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB18_2 +; GFX9-NEXT: s_cbranch_execz BB20_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3547,7 +3859,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB18_2: +; GFX9-NEXT: BB20_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 @@ -3573,7 +3885,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; 
implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB18_2 +; GFX1064-NEXT: s_cbranch_execz BB20_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3583,7 +3895,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB18_2: +; GFX1064-NEXT: BB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -3606,7 +3918,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB18_2 +; GFX1032-NEXT: s_cbranch_execz BB20_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3616,7 +3928,7 @@ define amdgpu_kernel void @min_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB18_2: +; GFX1032-NEXT: BB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -3684,7 +3996,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB19_2 +; GFX8-NEXT: s_cbranch_execz BB21_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -3692,7 +4004,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB19_2: +; GFX8-NEXT: BB21_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3735,14 +4047,14 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB19_2 +; GFX9-NEXT: s_cbranch_execz BB21_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB19_2: +; GFX9-NEXT: BB21_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3794,7 +4106,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB19_2 +; GFX1064-NEXT: s_cbranch_execz BB21_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -3804,7 +4116,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB19_2: +; GFX1064-NEXT: BB21_2: ; 
GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 @@ -3845,7 +4157,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB19_2 +; GFX1032-NEXT: s_cbranch_execz BB21_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -3854,7 +4166,7 @@ define amdgpu_kernel void @umax_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB19_2: +; GFX1032-NEXT: BB21_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -3882,7 +4194,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB20_2 +; GFX7LESS-NEXT: s_cbranch_execz BB22_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -3891,7 +4203,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB20_2: +; GFX7LESS-NEXT: BB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -3916,7 +4228,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB20_2 +; GFX8-NEXT: s_cbranch_execz BB22_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3925,7 +4237,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB20_2: +; GFX8-NEXT: BB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -3950,7 +4262,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB20_2 +; GFX9-NEXT: s_cbranch_execz BB22_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3958,7 +4270,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB20_2: +; GFX9-NEXT: BB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -3983,7 +4295,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB20_2 +; 
GFX1064-NEXT: s_cbranch_execz BB22_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -3993,7 +4305,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB20_2: +; GFX1064-NEXT: BB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -4016,7 +4328,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB20_2 +; GFX1032-NEXT: s_cbranch_execz BB22_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4026,7 +4338,7 @@ define amdgpu_kernel void @umax_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB20_2: +; GFX1032-NEXT: BB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 @@ -4094,7 +4406,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB21_2 +; GFX8-NEXT: s_cbranch_execz BB23_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 @@ -4102,7 +4414,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB21_2: +; GFX8-NEXT: BB23_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 @@ -4145,14 +4457,14 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB21_2 +; GFX9-NEXT: s_cbranch_execz BB23_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB21_2: +; GFX9-NEXT: BB23_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 @@ -4204,7 +4516,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX1064-NEXT: s_cbranch_execz BB21_2 +; GFX1064-NEXT: s_cbranch_execz BB23_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v4, s7 @@ -4214,7 +4526,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB21_2: +; GFX1064-NEXT: BB23_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 
s3, v0 @@ -4255,7 +4567,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB21_2 +; GFX1032-NEXT: s_cbranch_execz BB23_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v7, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v4, s4 @@ -4264,7 +4576,7 @@ define amdgpu_kernel void @umin_i32_varying(i32 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB21_2: +; GFX1032-NEXT: BB23_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 @@ -4292,7 +4604,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX7LESS-NEXT: s_cbranch_execz BB22_2 +; GFX7LESS-NEXT: s_cbranch_execz BB24_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 @@ -4301,7 +4613,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: BB22_2: +; GFX7LESS-NEXT: BB24_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 @@ -4326,7 +4638,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX8-NEXT: s_cbranch_execz BB22_2 +; GFX8-NEXT: s_cbranch_execz BB24_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4335,7 +4647,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: BB22_2: +; GFX8-NEXT: BB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -4360,7 +4672,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX9-NEXT: s_cbranch_execz BB22_2 +; GFX9-NEXT: s_cbranch_execz BB24_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4368,7 +4680,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: BB22_2: +; GFX9-NEXT: BB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -4393,7 +4705,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc -; GFX1064-NEXT: s_cbranch_execz BB22_2 +; GFX1064-NEXT: s_cbranch_execz BB24_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mov_b32_e32 v0, 5 ; GFX1064-NEXT: 
v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4403,7 +4715,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: BB22_2: +; GFX1064-NEXT: BB24_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 @@ -4426,7 +4738,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX1032-NEXT: s_cbranch_execz BB22_2 +; GFX1032-NEXT: s_cbranch_execz BB24_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mov_b32_e32 v0, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo @@ -4436,7 +4748,7 @@ define amdgpu_kernel void @umin_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: BB22_2: +; GFX1032-NEXT: BB24_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -- GitLab From 9d2df964070700ae0d244e84572ac2275050e49a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Mar 2021 16:02:31 +0000 Subject: [PATCH 0154/1000] [DAG] computeKnownBits - add ISD::MULHS/MULHU/SMUL_LOHI/UMUL_LOHI handling Reuse the existing KnownBits multiplication code to handle the 'extend + multiply + extract high bits' pattern for multiply-high ops. Noticed while looking at the codegen for D88785 / D98587 - the patch helps division-by-constant expansion code in particular, which suggests that we might have some further KnownBits div/rem cases we could handle - but this was far easier to implement. 
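Illustrative sketch of the fact being exploited (a standalone example, not
LLVM code; mulhu32 and the sample values are made up for this note - the
in-tree entry points are KnownBits::mulhu, KnownBits::mulhs and
KnownBits::computeForMul used in the diff below):

  #include <cassert>
  #include <cstdint>

  // Reference semantics of ISD::MULHU on i32: zero-extend both operands,
  // multiply in the wide type, and return the high half of the product.
  static uint32_t mulhu32(uint32_t A, uint32_t B) {
    return (uint32_t)(((uint64_t)A * (uint64_t)B) >> 32);
  }

  int main() {
    // If A is known to fit in 24 bits and B in 3 bits, the full product
    // fits in 27 bits, so all 32 bits of the high half are known zero.
    uint32_t A = (1u << 24) - 1; // high 8 bits known zero
    uint32_t B = 5;              // high 29 bits known zero
    assert(mulhu32(A, B) == 0);  // 8 + 29 >= 32 => high half is zero
    return 0;
  }

This known-zero propagation through the multiply-high is what lets several
of the AMDGPU tests below replace a v_mul_hi_u32_u24 of a small bit-count
by 5 with a plain v_mov_b32 of 0.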
Differential Revision: https://reviews.llvm.org/D98857 --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 32 ++++ .../atomic_optimizations_global_pointer.ll | 46 +++--- .../atomic_optimizations_local_pointer.ll | 44 ++--- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 41 +++-- llvm/test/CodeGen/AMDGPU/srem64.ll | 41 +++-- llvm/test/CodeGen/AMDGPU/udiv64.ll | 53 +++--- .../AMDGPU/urem-seteq-illegal-types.ll | 9 +- llvm/test/CodeGen/AMDGPU/urem64.ll | 31 ++-- llvm/test/CodeGen/ARM/select-imm.ll | 25 +-- .../CodeGen/M68k/Arith/divide-by-constant.ll | 1 - .../PowerPC/urem-seteq-illegal-types.ll | 3 +- llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll | 152 +++++++++--------- .../CodeGen/X86/2008-04-17-CoalescerBug.ll | 71 ++++---- llvm/test/CodeGen/X86/combine-udiv.ll | 27 ++-- ...of-two-or-zero-when-comparing-with-zero.ll | 49 ++---- .../CodeGen/X86/smul_fix_sat_constants.ll | 42 ++--- llvm/test/CodeGen/X86/umul_fix.ll | 13 +- 17 files changed, 319 insertions(+), 361 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index dedc25c079eb..f89c5571f82b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2979,6 +2979,38 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts, Known = KnownBits::computeForMul(Known, Known2); break; } + case ISD::MULHU: { + Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known = KnownBits::mulhu(Known, Known2); + break; + } + case ISD::MULHS: { + Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + Known = KnownBits::mulhs(Known, Known2); + break; + } + case ISD::UMUL_LOHI: { + assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result"); + Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (Op.getResNo() == 0) + Known = KnownBits::computeForMul(Known, Known2); + else + Known = KnownBits::mulhu(Known, Known2); + break; + } + case ISD::SMUL_LOHI: { + assert((Op.getResNo() == 0 || Op.getResNo() == 1) && "Unknown result"); + Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (Op.getResNo() == 0) + Known = KnownBits::computeForMul(Known, Known2); + else + Known = KnownBits::mulhs(Known, Known2); + break; + } case ISD::UDIV: { Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index aba4f7d80aa9..7db3e8a9ae8b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -609,14 +609,14 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GFX7LESS-NEXT: s_cbranch_execz BB3_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 ; GFX7LESS-NEXT: s_mov_b32 s10, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7] 
-; GFX7LESS-NEXT: s_mul_i32 s3, s2, 5 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) @@ -651,12 +651,12 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GFX89-NEXT: s_waitcnt lgkmcnt(0) ; GFX89-NEXT: s_mov_b32 s8, s2 ; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX89-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 ; GFX89-NEXT: s_mul_i32 s2, s2, 5 ; GFX89-NEXT: s_mov_b32 s11, 0xf000 ; GFX89-NEXT: s_mov_b32 s10, -1 ; GFX89-NEXT: s_mov_b32 s9, s3 ; GFX89-NEXT: v_mov_b32_e32 v1, s2 +; GFX89-NEXT: v_mov_b32_e32 v2, 0 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX89-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) @@ -687,10 +687,10 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GCN64-NEXT: s_cbranch_execz BB3_2 ; GCN64-NEXT: ; %bb.1: ; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GCN64-NEXT: v_mov_b32_e32 v2, 0 +; GCN64-NEXT: s_mul_i32 s6, s6, 5 ; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: s_mul_i32 s7, s6, 5 -; GCN64-NEXT: v_mul_hi_u32_u24_e64 v2, s6, 5 -; GCN64-NEXT: v_mov_b32_e32 v1, s7 +; GCN64-NEXT: v_mov_b32_e32 v1, s6 ; GCN64-NEXT: s_mov_b32 s10, -1 ; GCN64-NEXT: s_waitcnt lgkmcnt(0) ; GCN64-NEXT: s_mov_b32 s8, s2 @@ -724,10 +724,10 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GCN32-NEXT: s_cbranch_execz BB3_2 ; GCN32-NEXT: ; %bb.1: ; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 +; GCN32-NEXT: v_mov_b32_e32 v2, 0 +; GCN32-NEXT: s_mul_i32 s5, s5, 5 ; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: s_mul_i32 s6, s5, 5 -; GCN32-NEXT: v_mul_hi_u32_u24_e64 v2, s5, 5 -; GCN32-NEXT: v_mov_b32_e32 v1, s6 +; GCN32-NEXT: v_mov_b32_e32 v1, s5 ; GCN32-NEXT: s_mov_b32 s10, -1 ; GCN32-NEXT: s_waitcnt lgkmcnt(0) ; GCN32-NEXT: s_mov_b32 s8, s2 @@ -1700,14 +1700,14 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GFX7LESS-NEXT: s_cbranch_execz BB9_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 ; GFX7LESS-NEXT: s_mov_b32 s10, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s9, s3 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX7LESS-NEXT: s_mul_i32 s3, s2, 5 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) @@ -1742,12 +1742,12 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s8, s2 ; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 ; GFX8-NEXT: s_mul_i32 s2, s2, 5 ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 ; GFX8-NEXT: s_mov_b32 s9, s3 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -1781,12 +1781,12 @@ define amdgpu_kernel void 
@sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 ; GFX9-NEXT: s_mul_i32 s2, s2, 5 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 ; GFX9-NEXT: s_mov_b32 s9, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -1818,10 +1818,10 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GCN64-NEXT: s_cbranch_execz BB9_2 ; GCN64-NEXT: ; %bb.1: ; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GCN64-NEXT: v_mov_b32_e32 v2, 0 +; GCN64-NEXT: s_mul_i32 s6, s6, 5 ; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: s_mul_i32 s7, s6, 5 -; GCN64-NEXT: v_mul_hi_u32_u24_e64 v2, s6, 5 -; GCN64-NEXT: v_mov_b32_e32 v1, s7 +; GCN64-NEXT: v_mov_b32_e32 v1, s6 ; GCN64-NEXT: s_mov_b32 s10, -1 ; GCN64-NEXT: s_waitcnt lgkmcnt(0) ; GCN64-NEXT: s_mov_b32 s8, s2 @@ -1858,10 +1858,10 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GCN32-NEXT: s_cbranch_execz BB9_2 ; GCN32-NEXT: ; %bb.1: ; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 +; GCN32-NEXT: v_mov_b32_e32 v2, 0 +; GCN32-NEXT: s_mul_i32 s5, s5, 5 ; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: s_mul_i32 s6, s5, 5 -; GCN32-NEXT: v_mul_hi_u32_u24_e64 v2, s5, 5 -; GCN32-NEXT: v_mov_b32_e32 v1, s6 +; GCN32-NEXT: v_mov_b32_e32 v1, s5 ; GCN32-NEXT: s_mov_b32 s10, -1 ; GCN32-NEXT: s_waitcnt lgkmcnt(0) ; GCN32-NEXT: s_mov_b32 s8, s2 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index f3de201745d0..eadcb2a1eca2 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -746,10 +746,10 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_cbranch_execz BB4_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX7LESS-NEXT: s_mul_i32 s5, s4, 5 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] @@ -781,9 +781,9 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_cbranch_execz BB4_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -815,9 +815,9 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_cbranch_execz BB4_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] @@ -848,10 +848,10 @@ define 
amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_cbranch_execz BB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mul_i32 s4, s4, 5 ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX1064-NEXT: s_mul_i32 s5, s4, 5 -; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v1, s5 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] @@ -880,10 +880,10 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_cbranch_execz BB4_2 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX1032-NEXT: s_mul_i32 s4, s3, 5 -; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] @@ -1945,10 +1945,10 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX7LESS-NEXT: s_cbranch_execz BB11_2 ; GFX7LESS-NEXT: ; %bb.1: ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX7LESS-NEXT: s_mul_i32 s5, s4, 5 -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] @@ -1980,9 +1980,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX8-NEXT: s_cbranch_execz BB11_2 ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2015,9 +2015,9 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX9-NEXT: s_cbranch_execz BB11_2 ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX9-NEXT: s_mul_i32 s4, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] @@ -2049,10 +2049,10 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1064-NEXT: s_cbranch_execz BB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mul_i32 s4, s4, 5 ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX1064-NEXT: s_mul_i32 s5, s4, 5 -; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v1, s5 +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] @@ -2084,10 +2084,10 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out) { ; GFX1032-NEXT: s_cbranch_execz BB11_2 ; GFX1032-NEXT: ; %bb.1: ; 
GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mul_i32 s3, s3, 5 ; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX1032-NEXT: s_mul_i32 s4, s3, 5 -; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 091959adcd71..8ae54e8d91ae 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1202,15 +1202,14 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v5, 24, v0 ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 -; GCN-NEXT: v_mul_hi_u32 v6, 24, v3 -; GCN-NEXT: v_mul_hi_u32 v0, 0, v0 +; GCN-NEXT: v_mul_hi_u32 v0, 24, v0 +; GCN-NEXT: v_mul_hi_u32 v5, 24, v3 ; GCN-NEXT: v_mul_hi_u32 v3, 0, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v0, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v2, s8, v1 ; GCN-NEXT: v_mul_hi_u32 v3, s8, v0 @@ -1420,15 +1419,14 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_addc_u32_e64 v4, vcc, v4, v6, s[4:5] ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v6, 24, v3 ; GCN-NEXT: v_mul_lo_u32 v5, v4, 24 -; GCN-NEXT: v_mul_hi_u32 v7, 24, v4 -; GCN-NEXT: v_mul_hi_u32 v3, 0, v3 +; GCN-NEXT: v_mul_hi_u32 v3, 24, v3 +; GCN-NEXT: v_mul_hi_u32 v6, 24, v4 ; GCN-NEXT: v_mul_hi_u32 v4, 0, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v7, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v13, v6, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 @@ -1633,15 +1631,14 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, 0, v4, vcc -; GCN-NEXT: v_mul_hi_u32 v5, s4, v3 -; GCN-NEXT: v_mul_hi_u32 v6, s4, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v7, 15, v4 -; GCN-NEXT: v_mul_hi_u32 v3, 0, v3 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 +; GCN-NEXT: v_mul_hi_u32 v3, s4, v3 +; GCN-NEXT: v_mul_hi_u32 v5, s4, v4 +; GCN-NEXT: v_lshlrev_b32_e32 v6, 15, v4 ; GCN-NEXT: v_mul_hi_u32 v4, 0, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v13, v6, vcc -; GCN-NEXT: v_add_i32_e32 v5, vcc, 0, v5 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v6, v3, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v6 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GCN-NEXT: v_add_i32_e32 v3, vcc, 0, v3 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v12, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v0, 
v4 ; GCN-NEXT: v_mul_hi_u32 v6, v0, v3 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index cd0b7f77af43..261d466f0142 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1390,15 +1390,14 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v5, 24, v0 ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 -; GCN-NEXT: v_mul_hi_u32 v6, 24, v3 -; GCN-NEXT: v_mul_hi_u32 v0, 0, v0 +; GCN-NEXT: v_mul_hi_u32 v0, 24, v0 +; GCN-NEXT: v_mul_hi_u32 v5, 24, v3 ; GCN-NEXT: v_mul_hi_u32 v3, 0, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v6, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v0, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v2, v5, vcc +; GCN-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GCN-NEXT: v_mul_lo_u32 v1, s8, v1 ; GCN-NEXT: v_mul_hi_u32 v2, s8, v0 @@ -1605,15 +1604,14 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v5, s[4:5] ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v5, 24, v2 ; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 -; GCN-NEXT: v_mul_hi_u32 v6, 24, v3 -; GCN-NEXT: v_mul_hi_u32 v2, 0, v2 +; GCN-NEXT: v_mul_hi_u32 v2, 24, v2 +; GCN-NEXT: v_mul_hi_u32 v5, 24, v3 ; GCN-NEXT: v_mul_hi_u32 v3, 0, v3 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v12, v6, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v12, v5, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0, v2 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 @@ -1816,15 +1814,14 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 ; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_hi_u32 v4, s4, v2 -; GCN-NEXT: v_mul_hi_u32 v5, s4, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v6, 15, v3 -; GCN-NEXT: v_mul_hi_u32 v2, 0, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 +; GCN-NEXT: v_mul_hi_u32 v2, s4, v2 +; GCN-NEXT: v_mul_hi_u32 v4, s4, v3 +; GCN-NEXT: v_lshlrev_b32_e32 v5, 15, v3 ; GCN-NEXT: v_mul_hi_u32 v3, 0, v3 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v12, v5, vcc -; GCN-NEXT: v_add_i32_e32 v4, vcc, 0, v4 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v5 +; GCN-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc +; GCN-NEXT: v_add_i32_e32 v2, vcc, 0, v2 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v11, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v0, v3 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 779c37b65a28..95303d81def0 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -969,14 +969,14 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6 ; GCN-NEXT: v_mul_lo_u32 v7, v0, v4 ; 
GCN-NEXT: v_mul_hi_u32 v9, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v8, v3, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 @@ -999,27 +999,24 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_lo_u32 v4, v4, v6 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1] +; GCN-NEXT: v_addc_u32_e64 v1, vcc, v3, v1, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GCN-NEXT: v_mul_lo_u32 v3, v1, 24 ; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_hi_u32 v3, v3, 24 +; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 ; GCN-NEXT: v_mov_b32_e32 v5, s7 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v3, vcc -; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v2, s6, v1 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v1, vcc +; GCN-NEXT: v_mul_lo_u32 v1, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v3, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v4, s7, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v3, v2 +; GCN-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-NEXT: v_mul_lo_u32 v3, s6, v0 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 +; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v1 ; GCN-NEXT: v_sub_i32_e32 v3, vcc, 24, v3 ; GCN-NEXT: v_subb_u32_e64 v4, s[0:1], v4, v5, vcc ; GCN-NEXT: v_subrev_i32_e64 v5, s[0:1], s6, v3 @@ -1031,21 +1028,21 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], s7, v4 ; GCN-NEXT: v_cndmask_b32_e64 v4, v6, v5, s[0:1] ; GCN-NEXT: v_add_i32_e64 v5, s[0:1], 2, v0 -; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1] +; GCN-NEXT: v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1] +; GCN-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc ; GCN-NEXT: v_add_i32_e64 v7, s[0:1], 1, v0 -; GCN-NEXT: v_subb_u32_e32 v2, vcc, 0, v2, vcc -; GCN-NEXT: v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1] +; GCN-NEXT: v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1] +; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v1 ; GCN-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v4 -; GCN-NEXT: v_cmp_le_u32_e32 vcc, s7, v2 -; GCN-NEXT: v_cndmask_b32_e64 v4, v8, v6, s[0:1] -; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc ; GCN-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GCN-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s7, v2 -; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1 +; GCN-NEXT: 
v_cndmask_b32_e32 v1, v4, v3, vcc +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v7, v5, s[0:1] -; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll index ea8068a8f5ad..eee9a4e69738 100644 --- a/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AMDGPU/urem-seteq-illegal-types.ll @@ -9,7 +9,7 @@ define i1 @test_urem_odd(i13 %X) nounwind { ; CHECK-NEXT: s_mov_b32 s4, 0xcccccccd ; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, 5 +; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -28,7 +28,7 @@ define i1 @test_urem_even(i27 %X) nounwind { ; CHECK-NEXT: s_mov_b32 s4, 0x92492493 ; CHECK-NEXT: v_mul_hi_u32 v0, v0, s4 ; CHECK-NEXT: v_lshrrev_b32_e32 v0, 2, v0 -; CHECK-NEXT: v_mul_lo_u32 v0, v0, 14 +; CHECK-NEXT: v_mul_u32_u24_e32 v0, 14, v0 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -46,7 +46,7 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; CHECK-NEXT: s_mov_b32 s4, 0xcccccccd ; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 2, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, 5 +; CHECK-NEXT: v_mul_u32_u24_e32 v1, 5, v1 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc @@ -62,10 +62,9 @@ define i1 @test_urem_negative_odd(i9 %X) nounwind { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v0, 0x1ff, v0 ; CHECK-NEXT: s_mov_b32 s4, 0x2050c9f9 -; CHECK-NEXT: s_movk_i32 s5, 0x1fb ; CHECK-NEXT: v_mul_hi_u32 v1, v0, s4 ; CHECK-NEXT: v_lshrrev_b32_e32 v1, 6, v1 -; CHECK-NEXT: v_mul_lo_u32 v1, v1, s5 +; CHECK-NEXT: v_mul_u32_u24_e32 v1, 0x1fb, v1 ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index a0eba73e7d0f..8458512b3f75 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -779,14 +779,14 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; GCN-NEXT: v_mul_hi_u32 v5, v0, v6 ; GCN-NEXT: v_mul_lo_u32 v7, v0, v4 ; GCN-NEXT: v_mul_hi_u32 v9, v0, v4 -; GCN-NEXT: v_mul_lo_u32 v8, v3, v6 -; GCN-NEXT: v_mul_hi_u32 v6, v3, v6 +; GCN-NEXT: v_mul_hi_u32 v8, v3, v6 +; GCN-NEXT: v_mul_lo_u32 v6, v3, v6 ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v7 ; GCN-NEXT: v_mul_hi_u32 v10, v3, v4 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v2, v9, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v3, v4 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v8, v5 -; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc +; GCN-NEXT: v_add_i32_e32 v5, vcc, v6, v5 +; GCN-NEXT: v_addc_u32_e32 v5, vcc, v7, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v10, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_add_i32_e64 v0, s[0:1], v0, v4 @@ -809,27 +809,24 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x) ; 
GCN-NEXT: v_mul_lo_u32 v4, v4, v6 ; GCN-NEXT: v_add_i32_e32 v7, vcc, v10, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v11, v9, vcc -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v1, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v8, v1, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v7, v4 -; GCN-NEXT: v_addc_u32_e32 v6, vcc, v2, v6, vcc +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e64 v3, vcc, v3, v6, s[0:1] +; GCN-NEXT: v_addc_u32_e64 v1, vcc, v3, v1, s[0:1] ; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: v_mul_lo_u32 v4, v3, 24 -; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 -; GCN-NEXT: v_mul_hi_u32 v3, v3, 24 -; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v4 -; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v3, vcc ; GCN-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: v_mul_lo_u32 v1, s6, v1 +; GCN-NEXT: v_mul_lo_u32 v3, v1, 24 +; GCN-NEXT: v_mul_hi_u32 v0, v0, 24 +; GCN-NEXT: v_mul_hi_u32 v1, v1, 24 +; GCN-NEXT: v_add_i32_e32 v0, vcc, v0, v3 +; GCN-NEXT: v_addc_u32_e32 v0, vcc, v2, v1, vcc +; GCN-NEXT: v_mul_lo_u32 v1, s7, v0 ; GCN-NEXT: v_mul_hi_u32 v2, s6, v0 -; GCN-NEXT: v_mul_lo_u32 v3, s7, v0 ; GCN-NEXT: v_mul_lo_u32 v0, s6, v0 +; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_add_i32_e32 v1, vcc, v2, v1 -; GCN-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0, v1 -; GCN-NEXT: v_mov_b32_e32 v3, s7 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 ; GCN-NEXT: v_subb_u32_e64 v2, s[0:1], v2, v3, vcc ; GCN-NEXT: v_subrev_i32_e64 v4, s[0:1], s6, v0 diff --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll index 4682c2fb1bf0..b06394fa2bb9 100644 --- a/llvm/test/CodeGen/ARM/select-imm.ll +++ b/llvm/test/CodeGen/ARM/select-imm.ll @@ -684,29 +684,16 @@ define i1 @t11() { ; ARM: @ %bb.0: @ %entry ; ARM-NEXT: .pad #4 ; ARM-NEXT: sub sp, sp, #4 -; ARM-NEXT: ldr r0, .LCPI10_0 -; ARM-NEXT: mov r1, #33 -; ARM-NEXT: umull r2, r3, r1, r0 -; ARM-NEXT: lsr r0, r3, #3 -; ARM-NEXT: add r0, r0, r0, lsl #2 -; ARM-NEXT: sub r0, r1, r0, lsl #1 -; ARM-NEXT: ldr r1, [sp] -; ARM-NEXT: and r1, r1, #-33554432 -; ARM-NEXT: orr r0, r1, r0 -; ARM-NEXT: mov r1, #255 +; ARM-NEXT: ldr r0, [sp] +; ARM-NEXT: mov r1, #40960 +; ARM-NEXT: orr r1, r1, #-33554432 ; ARM-NEXT: orr r0, r0, #40960 -; ARM-NEXT: orr r1, r1, #3840 -; ARM-NEXT: str r0, [sp] ; ARM-NEXT: and r0, r0, r1 -; ARM-NEXT: sub r0, r0, #3 -; ARM-NEXT: rsbs r1, r0, #0 -; ARM-NEXT: adc r0, r0, r1 +; ARM-NEXT: orr r0, r0, #3 +; ARM-NEXT: str r0, [sp] +; ARM-NEXT: mov r0, #1 ; ARM-NEXT: add sp, sp, #4 ; ARM-NEXT: mov pc, lr -; ARM-NEXT: .p2align 2 -; ARM-NEXT: @ %bb.1: -; ARM-NEXT: .LCPI10_0: -; ARM-NEXT: .long 3435973837 @ 0xcccccccd ; ; ARMT2-LABEL: t11: ; ARMT2: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll b/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll index 04114d97e457..b695202f8ec2 100644 --- a/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll +++ b/llvm/test/CodeGen/M68k/Arith/divide-by-constant.ll @@ -44,7 +44,6 @@ define zeroext i8 @test3(i8 zeroext %x, i8 zeroext %c) { ; CHECK-NEXT: lsr.l #8, %d0 ; CHECK-NEXT: lsr.w #1, %d0 ; CHECK-NEXT: and.l #65535, %d0 -; CHECK-NEXT: and.l #255, %d0 ; CHECK-NEXT: rts entry: %div = udiv i8 %c, 3 diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll index c0f8749f78dc..40d402d424e6 100644 --- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll +++ 
b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll @@ -88,9 +88,8 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; PPC64LE-NEXT: clrlwi 3, 3, 28 ; PPC64LE-NEXT: ori 4, 4, 52429 ; PPC64LE-NEXT: mulhwu 4, 3, 4 -; PPC64LE-NEXT: rlwinm 5, 4, 0, 0, 29 ; PPC64LE-NEXT: srwi 4, 4, 2 -; PPC64LE-NEXT: add 4, 4, 5 +; PPC64LE-NEXT: rlwimi 4, 4, 2, 28, 29 ; PPC64LE-NEXT: sub 3, 3, 4 ; PPC64LE-NEXT: cntlzw 3, 3 ; PPC64LE-NEXT: not 3, 3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll index 1514fdafcfda..3b3471476dfa 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mla.ll @@ -551,13 +551,13 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, lr} ; CHECK-NEXT: push {r4, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.u8 r0, q1[1] -; CHECK-NEXT: vmov.u8 r1, q1[0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov.u8 r0, q1[3] +; CHECK-NEXT: vmov.u8 r1, q1[2] ; CHECK-NEXT: vmov q3[2], q3[0], r1, r0 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.u8 r2, q0[2] ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmov q4[2], q4[0], r2, r1 ; CHECK-NEXT: vand q3, q3, q2 @@ -566,55 +566,53 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-NEXT: vmov r1, s18 ; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: umull r12, r1, r1, r0 -; CHECK-NEXT: vmov.u8 r0, q1[2] +; CHECK-NEXT: umull r0, r1, r1, r0 ; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: orr.w lr, r3, r1 -; CHECK-NEXT: vmov.u8 r3, q1[3] -; CHECK-NEXT: vmov q3[2], q3[0], r0, r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: vand q3, q3, q2 -; CHECK-NEXT: vmov q4[2], q4[0], r1, r3 -; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov q3[2], q3[0], r2, r0 +; CHECK-NEXT: vmov.u8 r0, q1[0] +; CHECK-NEXT: vmov q3[3], q3[1], r3, r1 +; CHECK-NEXT: vmov.u8 r3, q1[1] +; CHECK-NEXT: vmov q4[2], q4[0], r0, r3 +; CHECK-NEXT: vmov.u8 r3, q0[1] +; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r1, s18 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: umull r0, r1, r1, r0 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: vmov q3[2], q3[0], r3, r0 -; CHECK-NEXT: vmov q3[3], q3[1], r4, r1 -; CHECK-NEXT: vmov.u8 r4, q0[4] -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov q5[2], q5[0], r2, r3 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vmov r3, s18 +; CHECK-NEXT: vmov r2, s20 +; CHECK-NEXT: vmov r4, s22 +; CHECK-NEXT: vmov lr, s12 +; CHECK-NEXT: vmov r12, s13 +; CHECK-NEXT: umull r0, r2, r2, r0 +; CHECK-NEXT: smlabb r0, r4, r3, r0 ; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: adc.w r0, r0, lr -; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: adds.w r0, r0, lr +; CHECK-NEXT: adc.w r2, r2, r12 +; CHECK-NEXT: adds.w r12, r0, r3 ; CHECK-NEXT: vmov.u8 r3, q1[4] -; CHECK-NEXT: adc.w r12, r0, r1 -; CHECK-NEXT: vmov.u8 r1, q1[5] -; CHECK-NEXT: vmov q3[2], q3[0], r3, r1 +; CHECK-NEXT: adcs r1, r2 +; CHECK-NEXT: vmov.u8 r2, q1[5] +; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 ; CHECK-NEXT: vmov.u8 r3, q0[5] ; CHECK-NEXT: vmov q4[2], q4[0], r4, r3 ; CHECK-NEXT: 
vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vmov r2, s14 ; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: umull r1, r3, r3, r1 +; CHECK-NEXT: umull r2, r3, r3, r2 ; CHECK-NEXT: umull r0, r4, r0, r4 -; CHECK-NEXT: vmov q3[2], q3[0], r0, r1 +; CHECK-NEXT: vmov q3[2], q3[0], r0, r2 ; CHECK-NEXT: vmov q3[3], q3[1], r4, r3 ; CHECK-NEXT: vmov.u8 r4, q0[6] -; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vmov r2, s12 ; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: adds r1, r1, r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: adc.w r0, r0, r12 +; CHECK-NEXT: adds.w r2, r2, r12 +; CHECK-NEXT: adcs r0, r1 +; CHECK-NEXT: vmov r1, s14 ; CHECK-NEXT: adds r1, r1, r2 ; CHECK-NEXT: vmov.u8 r2, q1[7] ; CHECK-NEXT: adc.w r12, r0, r3 @@ -722,7 +720,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) { ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: umlal r0, r1, r3, r2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> @@ -1466,58 +1464,56 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, lr} ; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vmov.u8 r2, q1[1] -; CHECK-NEXT: vmov.u8 r3, q1[0] +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov.u8 r2, q1[3] +; CHECK-NEXT: vmov.u8 r3, q1[2] ; CHECK-NEXT: vmov q3[2], q3[0], r3, r2 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.u8 r2, q0[0] +; CHECK-NEXT: vmov.u8 r3, q0[3] +; CHECK-NEXT: vmov.u8 r2, q0[2] ; CHECK-NEXT: vmov.i64 q2, #0xff ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r12, s14 ; CHECK-NEXT: vmov r2, s18 -; CHECK-NEXT: vmov.u8 r4, q1[2] -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vmov.u8 r5, q0[2] -; CHECK-NEXT: umull r12, lr, r2, r12 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: umull r2, r3, r3, r2 -; CHECK-NEXT: orr.w lr, lr, r3 -; CHECK-NEXT: vmov.u8 r3, q1[3] -; CHECK-NEXT: vmov q3[2], q3[0], r4, r3 -; CHECK-NEXT: vmov.u8 r4, q0[3] -; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 -; CHECK-NEXT: vand q3, q3, q2 +; CHECK-NEXT: vmov.u8 r4, q1[0] +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov.u8 r5, q0[0] +; CHECK-NEXT: umull lr, r12, r2, r12 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: umull r2, r3, r2, r3 +; CHECK-NEXT: vmov q3[2], q3[0], r2, lr +; CHECK-NEXT: vmov.u8 r2, q1[1] +; CHECK-NEXT: vmov q4[2], q4[0], r4, r2 +; CHECK-NEXT: vmov.u8 r4, q0[1] +; CHECK-NEXT: vmov q5[2], q5[0], r5, r4 ; CHECK-NEXT: vand q4, q4, q2 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: add r2, r12 -; CHECK-NEXT: vmov r5, s12 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: umull r3, r4, r4, r3 -; CHECK-NEXT: umull r5, r6, r6, r5 -; CHECK-NEXT: vmov q3[2], q3[0], r5, r3 -; CHECK-NEXT: vmov.u8 r5, q1[4] -; CHECK-NEXT: vmov q3[3], q3[1], r6, r4 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: adds r2, r2, r6 +; CHECK-NEXT: vand q5, q5, q2 +; CHECK-NEXT: vmov r2, s16 +; CHECK-NEXT: vmov r4, s20 +; CHECK-NEXT: vmov q3[3], q3[1], r3, r12 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: vmov r6, s22 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov lr, s13 +; CHECK-NEXT: umull r2, r4, r4, r2 +; CHECK-NEXT: smlabb r2, r6, r5, r2 ; CHECK-NEXT: vmov r6, s14 -; 
CHECK-NEXT: adc.w r3, r3, lr -; CHECK-NEXT: adds.w r12, r2, r6 +; CHECK-NEXT: vmov.u8 r5, q1[4] +; CHECK-NEXT: adds r2, r2, r3 +; CHECK-NEXT: adc.w r3, r4, lr +; CHECK-NEXT: vmov.u8 r4, q0[4] +; CHECK-NEXT: adds.w lr, r2, r6 ; CHECK-NEXT: vmov.u8 r6, q1[5] ; CHECK-NEXT: vmov q3[2], q3[0], r5, r6 -; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: vmov.u8 r5, q0[5] -; CHECK-NEXT: vmov.u8 r4, q0[4] ; CHECK-NEXT: vmov q4[2], q4[0], r4, r5 ; CHECK-NEXT: vand q3, q3, q2 ; CHECK-NEXT: vand q4, q4, q2 ; CHECK-NEXT: vmov r6, s14 ; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: adc.w r3, r3, r12 ; CHECK-NEXT: vmov r4, s12 ; CHECK-NEXT: vmov r2, s16 ; CHECK-NEXT: umull r6, r5, r5, r6 @@ -1527,7 +1523,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.u8 r4, q0[6] ; CHECK-NEXT: vmov r6, s12 ; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: adds.w r6, r6, r12 +; CHECK-NEXT: adds.w r6, r6, lr ; CHECK-NEXT: adcs r2, r3 ; CHECK-NEXT: vmov r3, s14 ; CHECK-NEXT: adds r3, r3, r6 @@ -1639,7 +1635,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: umlal r3, r2, r5, r6 ; CHECK-NEXT: adds r0, r0, r3 ; CHECK-NEXT: adcs r1, r2 -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: pop {r4, r5, r6, pc} entry: %xx = zext <16 x i8> %x to <16 x i64> diff --git a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll index 2ba3cf23774a..b8729a200a32 100644 --- a/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll +++ b/llvm/test/CodeGen/X86/2008-04-17-CoalescerBug.ll @@ -34,20 +34,20 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi ; CHECK-NEXT: movb {{[0-9]+}}(%esp), %bl ; CHECK-NEXT: testb $1, %bl -; CHECK-NEXT: je LBB0_27 +; CHECK-NEXT: je LBB0_25 ; CHECK-NEXT: ## %bb.1: ## %bb116.i -; CHECK-NEXT: je LBB0_27 +; CHECK-NEXT: je LBB0_25 ; CHECK-NEXT: ## %bb.2: ## %bb52.i.i ; CHECK-NEXT: testb $1, %bl -; CHECK-NEXT: je LBB0_27 +; CHECK-NEXT: je LBB0_25 ; CHECK-NEXT: ## %bb.3: ## %bb142.i -; CHECK-NEXT: je LBB0_27 +; CHECK-NEXT: je LBB0_25 ; CHECK-NEXT: ## %bb.4: ; CHECK-NEXT: movl L_.str89$non_lazy_ptr, %edi ; CHECK-NEXT: movb $1, %bh -; CHECK-NEXT: movl $274877907, %ebp ## imm = 0x10624DD3 +; CHECK-NEXT: movl L_.str$non_lazy_ptr, %ebp ; CHECK-NEXT: jmp LBB0_5 -; CHECK-NEXT: LBB0_23: ## %bb7806 +; CHECK-NEXT: LBB0_21: ## %bb7806 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp16: ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -58,7 +58,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: LBB0_5: ## %bb3261 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: cmpl $37, 0 -; CHECK-NEXT: jne LBB0_27 +; CHECK-NEXT: jne LBB0_25 ; CHECK-NEXT: ## %bb.6: ## %bb3306 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp0: @@ -70,7 +70,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: movl 0, %eax ; CHECK-NEXT: cmpl $121, %eax -; CHECK-NEXT: ja LBB0_27 +; CHECK-NEXT: ja LBB0_25 ; CHECK-NEXT: ## %bb.8: ## %bb3314 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: jmpl *LJTI0_0(,%eax,4) @@ -78,11 +78,11 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne LBB0_27 +; 
CHECK-NEXT: jne LBB0_25 ; CHECK-NEXT: ## %bb.11: ## %bb5809 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb %bh, %bh -; CHECK-NEXT: je LBB0_27 +; CHECK-NEXT: je LBB0_25 ; CHECK-NEXT: ## %bb.12: ## %bb91.i8504 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb $1, %bl @@ -98,10 +98,10 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: testb $1, %bl ; CHECK-NEXT: je LBB0_15 -; CHECK-NEXT: ## %bb.17: ## %bb278.i8617 +; CHECK-NEXT: ## %bb.16: ## %bb278.i8617 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: je LBB0_19 -; CHECK-NEXT: ## %bb.18: ## %bb440.i8663 +; CHECK-NEXT: je LBB0_18 +; CHECK-NEXT: ## %bb.17: ## %bb440.i8663 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp6: ; CHECK-NEXT: movl L_.str4$non_lazy_ptr, %eax @@ -110,39 +110,24 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl L__ZZNK10wxDateTime5GetTmERKNS_8TimeZoneEE12__FUNCTION__$non_lazy_ptr, %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) -; CHECK-NEXT: movl L_.str$non_lazy_ptr, %eax -; CHECK-NEXT: movl %eax, (%esp) +; CHECK-NEXT: movl %ebp, (%esp) ; CHECK-NEXT: movl $1717, {{[0-9]+}}(%esp) ## imm = 0x6B5 ; CHECK-NEXT: calll __Z10wxOnAssertPKwiPKcS0_S0_ ; CHECK-NEXT: Ltmp7: -; CHECK-NEXT: LBB0_19: ## %bb448.i8694 -; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: jmp LBB0_20 +; CHECK-NEXT: jmp LBB0_18 ; CHECK-NEXT: LBB0_15: ## %bb187.i8591 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: jne LBB0_27 -; CHECK-NEXT: ## %bb.16: ## %bb265.i8606 +; CHECK-NEXT: jne LBB0_25 +; CHECK-NEXT: LBB0_18: ## %invcont5814 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: imull %ebp -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: shrl $31, %eax -; CHECK-NEXT: shrl $6, %edx -; CHECK-NEXT: addl %eax, %edx -; CHECK-NEXT: imull $1000, %edx, %eax ## imm = 0x3E8 -; CHECK-NEXT: negl %eax -; CHECK-NEXT: LBB0_20: ## %invcont5814 -; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: movzwl %ax, %eax ; CHECK-NEXT: Ltmp8: -; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: Ltmp9: -; CHECK-NEXT: ## %bb.21: ## %invcont5831 +; CHECK-NEXT: ## %bb.19: ## %invcont5831 ; CHECK-NEXT: ## in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: Ltmp10: ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -160,8 +145,8 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: movl %eax, (%esp) ; CHECK-NEXT: calll __ZNK10wxDateTime12GetDayOfYearERKNS_8TimeZoneE ; CHECK-NEXT: Ltmp14: -; CHECK-NEXT: jmp LBB0_27 -; CHECK-NEXT: LBB0_22: ## %bb5968 +; CHECK-NEXT: jmp LBB0_25 +; CHECK-NEXT: LBB0_20: ## %bb5968 ; CHECK-NEXT: Ltmp2: ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) @@ -169,7 +154,7 @@ define void @_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: calll __ZN8wxString6FormatEPKwz ; CHECK-NEXT: subl $4, %esp ; CHECK-NEXT: Ltmp3: -; CHECK-NEXT: LBB0_27: ## %bb115.critedge.i +; CHECK-NEXT: LBB0_25: ## %bb115.critedge.i ; CHECK-NEXT: movl %esi, %eax ; CHECK-NEXT: addl $28, %esp ; CHECK-NEXT: popl %esi @@ -177,15 +162,15 @@ define void 
@_ZNK10wxDateTime6FormatEPKwRKNS_8TimeZoneE(%struct.wxString* noalia ; CHECK-NEXT: popl %ebx ; CHECK-NEXT: popl %ebp ; CHECK-NEXT: retl $4 -; CHECK-NEXT: LBB0_25: ## %lpad.loopexit.split-lp +; CHECK-NEXT: LBB0_23: ## %lpad.loopexit.split-lp ; CHECK-NEXT: Ltmp15: -; CHECK-NEXT: jmp LBB0_27 -; CHECK-NEXT: LBB0_26: ## %lpad8185 +; CHECK-NEXT: jmp LBB0_25 +; CHECK-NEXT: LBB0_24: ## %lpad8185 ; CHECK-NEXT: Ltmp12: -; CHECK-NEXT: jmp LBB0_27 -; CHECK-NEXT: LBB0_24: ## %lpad.loopexit +; CHECK-NEXT: jmp LBB0_25 +; CHECK-NEXT: LBB0_22: ## %lpad.loopexit ; CHECK-NEXT: Ltmp18: -; CHECK-NEXT: jmp LBB0_27 +; CHECK-NEXT: jmp LBB0_25 ; CHECK-NEXT: Lfunc_end0: entry: br i1 %foo, label %bb116.i, label %bb115.critedge.i diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll index c44342d00357..c6e741540999 100644 --- a/llvm/test/CodeGen/X86/combine-udiv.ll +++ b/llvm/test/CodeGen/X86/combine-udiv.ll @@ -693,23 +693,20 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) { define <8 x i16> @pr38477(<8 x i16> %a0) { ; SSE2-LABEL: pr38477: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,4957,57457,4103,16385,35545,2048,2115] -; SSE2-NEXT: pmulhuw %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psubw %xmm2, %xmm1 -; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: paddw %xmm2, %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: pandn %xmm1, %xmm3 -; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm1 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: por %xmm3, %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4957,57457,4103,16385,35545,2048,2115] +; SSE2-NEXT: pmulhuw %xmm0, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: psubw %xmm1, %xmm0 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: paddw %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: pmulhuw {{.*}}(%rip), %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: por %xmm3, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: pr38477: diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll index af01df6436ec..534c7121ffcb 100644 --- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -91,13 +91,7 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: psrld $2, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2 ; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -113,7 +107,7 @@ define <4 x i1> 
@p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) { ; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE4-NEXT: psrld $2, %xmm2 -; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm2 +; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm2 ; SSE4-NEXT: psubd %xmm2, %xmm0 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 @@ -130,8 +124,7 @@ define <4 x i1> @p4_vector_urem_by_const__splat(<4 x i32> %x, <4 x i32> %y) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -156,19 +149,12 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $2, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [3,5,6,9] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] -; SSE2-NEXT: pmuludq %xmm4, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: psubd %xmm1, %xmm0 +; SSE2-NEXT: psrld $2, %xmm1 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[1,2] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2,3,1] +; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2 +; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -187,7 +173,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ; SSE4-NEXT: psrld $2, %xmm2 ; SSE4-NEXT: psrld $1, %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5],xmm1[6,7] -; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm1 ; SSE4-NEXT: psubd %xmm1, %xmm0 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 @@ -204,7 +190,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX2-NEXT: vpsrlvd {{.*}}(%rip), %xmm1, %xmm1 -; AVX2-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 +; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 @@ -292,13 +278,7 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; SSE2-NEXT: psrld $2, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: pmaddwd {{.*}}(%rip), %xmm2 ; SSE2-NEXT: psubd %xmm2, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm0 @@ -314,7 +294,7 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32 ; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] ; SSE4-NEXT: psrld $2, %xmm2 -; SSE4-NEXT: pmulld {{.*}}(%rip), %xmm2 +; SSE4-NEXT: pmaddwd {{.*}}(%rip), %xmm2 ; SSE4-NEXT: psubd %xmm2, %xmm0 ; SSE4-NEXT: pxor %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm0 @@ -331,8 +311,7 @@ define <4 x i1> @p7_vector_urem_by_const__nonsplat_undef2(<4 x i32> %x, <4 x i32 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] ; AVX2-NEXT: vpsrld $2, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [6,6,6,6] -; AVX2-NEXT: vpmulld %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpmaddwd {{.*}}(%rip), %xmm1, %xmm1 ; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll index e92b8410e038..b8a46567e75b 100644 --- a/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll +++ b/llvm/test/CodeGen/X86/smul_fix_sat_constants.ll @@ -15,13 +15,13 @@ define i64 @func() nounwind { ; X64-NEXT: movl $2, %ecx ; X64-NEXT: movl $3, %eax ; X64-NEXT: imulq %rcx -; X64-NEXT: shrdq $2, %rdx, %rax ; X64-NEXT: cmpq $1, %rdx -; X64-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF -; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; X64-NEXT: movl $1, %ecx +; X64-NEXT: cmovgq %rax, %rcx ; X64-NEXT: cmpq $-2, %rdx -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; X64-NEXT: cmovgeq %rcx, %rax ; X64-NEXT: retq %tmp = call i64 @llvm.smul.fix.sat.i64(i64 3, i64 2, i32 2) ret i64 %tmp @@ -51,12 +51,12 @@ define i64 @func3() nounwind { ; X64-NEXT: movl $2, %edx ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: imulq %rdx -; X64-NEXT: shrdq $2, %rdx, %rax ; X64-NEXT: cmpq $1, %rdx -; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: movabsq $4611686018427387903, %rsi # imm = 0x3FFFFFFFFFFFFFFF +; X64-NEXT: cmovgq %rcx, %rsi ; X64-NEXT: cmpq $-2, %rdx -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 2) ret i64 %tmp @@ -69,12 +69,12 @@ define i64 @func4() nounwind { ; X64-NEXT: movl $2, %edx ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: imulq %rdx -; X64-NEXT: shrdq $32, %rdx, %rax ; X64-NEXT: cmpq $2147483647, %rdx # imm = 0x7FFFFFFF -; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: movl $4294967295, %esi # imm = 0xFFFFFFFF +; X64-NEXT: cmovgq %rcx, %rsi ; X64-NEXT: cmpq $-2147483648, %rdx # imm = 0x80000000 -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 32) ret i64 %tmp @@ -87,14 +87,14 
@@ define i64 @func5() nounwind { ; X64-NEXT: movl $2, %edx ; X64-NEXT: movq %rcx, %rax ; X64-NEXT: imulq %rdx -; X64-NEXT: shrdq $63, %rdx, %rax -; X64-NEXT: movabsq $4611686018427387903, %rsi # imm = 0x3FFFFFFFFFFFFFFF -; X64-NEXT: cmpq %rsi, %rdx -; X64-NEXT: cmovgq %rcx, %rax -; X64-NEXT: movabsq $-4611686018427387904, %rcx # imm = 0xC000000000000000 -; X64-NEXT: cmpq %rcx, %rdx -; X64-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000 -; X64-NEXT: cmovlq %rcx, %rax +; X64-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; X64-NEXT: cmpq %rax, %rdx +; X64-NEXT: movl $1, %esi +; X64-NEXT: cmovgq %rcx, %rsi +; X64-NEXT: movabsq $-4611686018427387904, %rax # imm = 0xC000000000000000 +; X64-NEXT: cmpq %rax, %rdx +; X64-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000 +; X64-NEXT: cmovgeq %rsi, %rax ; X64-NEXT: retq %tmp = call i64 @llvm.smul.fix.sat.i64(i64 9223372036854775807, i64 2, i32 63) ret i64 %tmp diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll index 8481fe4ac6b4..37cdc49bdfad 100644 --- a/llvm/test/CodeGen/X86/umul_fix.ll +++ b/llvm/test/CodeGen/X86/umul_fix.ll @@ -77,11 +77,11 @@ define i4 @func3(i4 %x, i4 %y) nounwind { ; X64-LABEL: func3: ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl $15, %esi -; X64-NEXT: andl $15, %eax -; X64-NEXT: imull %esi, %eax -; X64-NEXT: shrb $2, %al +; X64-NEXT: andb $15, %al +; X64-NEXT: andb $15, %sil ; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: mulb %sil +; X64-NEXT: shrb $2, %al ; X64-NEXT: retq ; ; X86-LABEL: func3: @@ -90,11 +90,8 @@ define i4 @func3(i4 %x, i4 %y) nounwind { ; X86-NEXT: andb $15, %al ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: andb $15, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: movzbl %al, %eax -; X86-NEXT: imull %ecx, %eax +; X86-NEXT: mulb %cl ; X86-NEXT: shrb $2, %al -; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl %tmp = call i4 @llvm.umul.fix.i4(i4 %x, i4 %y, i32 2) ret i4 %tmp -- GitLab From fb4f6057a637cabc687f7457d20a29da2c890ec0 Mon Sep 17 00:00:00 2001 From: Paul Robinson Date: Fri, 19 Mar 2021 07:43:36 -0700 Subject: [PATCH 0155/1000] [RGT] Recode more unreachable assertions and tautologies Count iterations of zero-trip loops and assert the count is zero, rather than asserting inside the loop. Unreachable functions should use llvm_unreachable. Remove tautological 'if' statements, even when they're following a pattern of checks. Found by the Rotten Green Tests project. 
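To make the pattern concrete, here is the shape of the zero-trip-loop rewrite, taken from the BitVectorTest hunk in the diff below (gtest macros; `Empty` is the empty bit vector from that test):

    // Before: the loop over an empty set never executes, so the
    // EXPECT inside it can never fire and the test checks nothing.
    for (unsigned Bit : Empty.set_bits()) {
      (void)Bit;
      EXPECT_TRUE(false);
    }

    // After: count the iterations and assert on the count, so the
    // check runs even when the loop body does not.
    int BitCount = 0;
    for (unsigned Bit : Empty.set_bits()) {
      (void)Bit;
      BitCount++;
    }
    ASSERT_EQ(BitCount, 0);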
--- llvm/unittests/ADT/BitVectorTest.cpp | 4 ++- llvm/unittests/ADT/ImmutableListTest.cpp | 1 - llvm/unittests/ADT/StringRefTest.cpp | 38 ++++++----------------- llvm/unittests/IR/BasicBlockTest.cpp | 4 ++- llvm/unittests/Linker/LinkModulesTest.cpp | 2 +- 5 files changed, 17 insertions(+), 32 deletions(-) diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp index 0f15a478e452..995f04e7efbb 100644 --- a/llvm/unittests/ADT/BitVectorTest.cpp +++ b/llvm/unittests/ADT/BitVectorTest.cpp @@ -1142,10 +1142,12 @@ TYPED_TEST(BitVectorTest, Iterators) { TypeParam Empty; EXPECT_EQ(Empty.set_bits_begin(), Empty.set_bits_end()); + int BitCount = 0; for (unsigned Bit : Empty.set_bits()) { (void)Bit; - EXPECT_TRUE(false); + BitCount++; } + ASSERT_EQ(BitCount, 0); TypeParam ToFill(100, false); ToFill.set(0); diff --git a/llvm/unittests/ADT/ImmutableListTest.cpp b/llvm/unittests/ADT/ImmutableListTest.cpp index ab3b8b472b90..28624c0d551d 100644 --- a/llvm/unittests/ADT/ImmutableListTest.cpp +++ b/llvm/unittests/ADT/ImmutableListTest.cpp @@ -245,7 +245,6 @@ TEST_F(ImmutableListTest, LongListOrderingTest) { int i = 0; for (ImmutableList>::iterator I = L.begin(), E = L.end(); I != E; ++I) { - ASSERT_EQ(i, *I); i++; } ASSERT_EQ(0, i); diff --git a/llvm/unittests/ADT/StringRefTest.cpp b/llvm/unittests/ADT/StringRefTest.cpp index 50e38c50f621..e3f943bdbf41 100644 --- a/llvm/unittests/ADT/StringRefTest.cpp +++ b/llvm/unittests/ADT/StringRefTest.cpp @@ -646,12 +646,8 @@ TEST(StringRefTest, getAsInteger) { ASSERT_TRUE(U32Success); } bool U64Success = StringRef(Unsigned[i].Str).getAsInteger(0, U64); - if (static_cast(Unsigned[i].Expected) == Unsigned[i].Expected) { - ASSERT_FALSE(U64Success); - EXPECT_EQ(U64, Unsigned[i].Expected); - } else { - ASSERT_TRUE(U64Success); - } + ASSERT_FALSE(U64Success); + EXPECT_EQ(U64, Unsigned[i].Expected); } int8_t S8; @@ -682,12 +678,8 @@ TEST(StringRefTest, getAsInteger) { ASSERT_TRUE(S32Success); } bool S64Success = StringRef(Signed[i].Str).getAsInteger(0, S64); - if (static_cast(Signed[i].Expected) == Signed[i].Expected) { - ASSERT_FALSE(S64Success); - EXPECT_EQ(S64, Signed[i].Expected); - } else { - ASSERT_TRUE(S64Success); - } + ASSERT_FALSE(S64Success); + EXPECT_EQ(S64, Signed[i].Expected); } } @@ -828,14 +820,9 @@ TEST(StringRefTest, consumeIntegerUnsigned) { Str = ConsumeUnsigned[i].Str; bool U64Success = Str.consumeInteger(0, U64); - if (static_cast(ConsumeUnsigned[i].Expected) == - ConsumeUnsigned[i].Expected) { - ASSERT_FALSE(U64Success); - EXPECT_EQ(U64, ConsumeUnsigned[i].Expected); - EXPECT_EQ(Str, ConsumeUnsigned[i].Leftover); - } else { - ASSERT_TRUE(U64Success); - } + ASSERT_FALSE(U64Success); + EXPECT_EQ(U64, ConsumeUnsigned[i].Expected); + EXPECT_EQ(Str, ConsumeUnsigned[i].Leftover); } } @@ -881,14 +868,9 @@ TEST(StringRefTest, consumeIntegerSigned) { Str = ConsumeSigned[i].Str; bool S64Success = Str.consumeInteger(0, S64); - if (static_cast(ConsumeSigned[i].Expected) == - ConsumeSigned[i].Expected) { - ASSERT_FALSE(S64Success); - EXPECT_EQ(S64, ConsumeSigned[i].Expected); - EXPECT_EQ(Str, ConsumeSigned[i].Leftover); - } else { - ASSERT_TRUE(S64Success); - } + ASSERT_FALSE(S64Success); + EXPECT_EQ(S64, ConsumeSigned[i].Expected); + EXPECT_EQ(Str, ConsumeSigned[i].Leftover); } } diff --git a/llvm/unittests/IR/BasicBlockTest.cpp b/llvm/unittests/IR/BasicBlockTest.cpp index fa923c90c729..408275732058 100644 --- a/llvm/unittests/IR/BasicBlockTest.cpp +++ b/llvm/unittests/IR/BasicBlockTest.cpp @@ -37,10 +37,12 @@ TEST(BasicBlockTest, 
PhiRange) { BranchInst::Create(BB.get(), BB2.get()); // Make sure this doesn't crash if there are no phis. + int PhiCount = 0; for (auto &PN : BB->phis()) { (void)PN; - EXPECT_TRUE(false) << "empty block should have no phis"; + PhiCount++; } + ASSERT_EQ(PhiCount, 0) << "empty block should have no phis"; // Make it a cycle. auto *BI = BranchInst::Create(BB.get(), BB.get());
diff --git a/llvm/unittests/Linker/LinkModulesTest.cpp b/llvm/unittests/Linker/LinkModulesTest.cpp index 05523c56cc2a..793c744a2df5 100644 --- a/llvm/unittests/Linker/LinkModulesTest.cpp +++ b/llvm/unittests/Linker/LinkModulesTest.cpp @@ -72,7 +72,7 @@ protected: }; static void expectNoDiags(const DiagnosticInfo &DI, void *C) { - EXPECT_TRUE(false); + llvm_unreachable("expectNoDiags called!"); } TEST_F(LinkModuleTest, BlockAddress) {
-- GitLab From 1fe042041c451760437d3e4285820f4581f0b744 Mon Sep 17 00:00:00 2001 From: Jianzhou Zhao Date: Wed, 17 Mar 2021 16:22:01 +0000 Subject: [PATCH 0156/1000] [dfsan] Add origin ABI wrappers supported: dl_get_tls_static_info, calloc, clock_gettime, dfsan_set_write_callback, dl_iterate_phdr, dlopen, memcpy, memmove, memset, pread, read, strcat, strdup, strncpy This is a part of https://reviews.llvm.org/D95835. Reviewed By: morehouse Differential Revision: https://reviews.llvm.org/D98790 --- compiler-rt/lib/dfsan/dfsan.cpp | 7 + compiler-rt/lib/dfsan/dfsan.h | 4 + compiler-rt/lib/dfsan/dfsan_custom.cpp | 200 ++++++++++++++++++++++++- compiler-rt/test/dfsan/custom.cpp | 154 +++++++++++++++---- 4 files changed, 337 insertions(+), 28 deletions(-)
diff --git a/compiler-rt/lib/dfsan/dfsan.cpp b/compiler-rt/lib/dfsan/dfsan.cpp index 5a9620aa417e..2aff8869d2cf 100644 --- a/compiler-rt/lib/dfsan/dfsan.cpp +++ b/compiler-rt/lib/dfsan/dfsan.cpp @@ -736,6 +736,13 @@ dfsan_read_origin_of_first_taint(const void *addr, uptr size) { return GetOriginIfTainted((uptr)addr, size); } +SANITIZER_INTERFACE_ATTRIBUTE void dfsan_set_label_origin(dfsan_label label, + dfsan_origin origin, + void *addr, + uptr size) { + __dfsan_set_label(label, origin, addr, size); +} + extern "C" SANITIZER_INTERFACE_ATTRIBUTE const struct dfsan_label_info *dfsan_get_label_info(dfsan_label label) { return &__dfsan_label_info[label];
diff --git a/compiler-rt/lib/dfsan/dfsan.h b/compiler-rt/lib/dfsan/dfsan.h index c2f173f079ff..73b4e4dcd297 100644 --- a/compiler-rt/lib/dfsan/dfsan.h +++ b/compiler-rt/lib/dfsan/dfsan.h @@ -48,6 +48,10 @@ void dfsan_clear_thread_local_state(); // from the address addr. dfsan_origin dfsan_read_origin_of_first_taint(const void *addr, uptr size); +// Set the data within [addr, addr+size) with label and origin. +void dfsan_set_label_origin(dfsan_label label, dfsan_origin origin, void *addr, + uptr size); + // Copy or move the origins of the len bytes from src to dst.
void dfsan_mem_origin_transfer(const void *dst, const void *src, uptr len); } // extern "C" diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp index 804c57bd3c35..96b7668db90c 100644 --- a/compiler-rt/lib/dfsan/dfsan_custom.cpp +++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp @@ -470,6 +470,15 @@ SANITIZER_INTERFACE_ATTRIBUTE void *__dfsw_calloc(size_t nmemb, size_t size, return p; } +SANITIZER_INTERFACE_ATTRIBUTE void *__dfso_calloc( + size_t nmemb, size_t size, dfsan_label nmemb_label, dfsan_label size_label, + dfsan_label *ret_label, dfsan_origin nmemb_origin, dfsan_origin size_origin, + dfsan_origin *ret_origin) { + void *p = __dfsw_calloc(nmemb, size, nmemb_label, size_label, ret_label); + *ret_origin = 0; + return p; +} + SANITIZER_INTERFACE_ATTRIBUTE size_t __dfsw_strlen(const char *s, dfsan_label s_label, dfsan_label *ret_label) { size_t ret = strlen(s); @@ -499,6 +508,11 @@ static void *dfsan_memmove(void *dest, const void *src, size_t n) { return internal_memmove(dest, src, n); } +static void *dfsan_memmove_with_origin(void *dest, const void *src, size_t n) { + dfsan_mem_origin_transfer(dest, src, n); + return dfsan_memmove(dest, src, n); +} + static void *dfsan_memcpy(void *dest, const void *src, size_t n) { dfsan_label *sdest = shadow_for(dest); const dfsan_label *ssrc = shadow_for(src); @@ -506,11 +520,22 @@ static void *dfsan_memcpy(void *dest, const void *src, size_t n) { return internal_memcpy(dest, src, n); } +static void *dfsan_memcpy_with_origin(void *dest, const void *src, size_t n) { + dfsan_mem_origin_transfer(dest, src, n); + return dfsan_memcpy(dest, src, n); +} + static void dfsan_memset(void *s, int c, dfsan_label c_label, size_t n) { internal_memset(s, c, n); dfsan_set_label(c_label, s, n); } +static void dfsan_memset_with_origin(void *s, int c, dfsan_label c_label, + dfsan_origin c_origin, size_t n) { + internal_memset(s, c, n); + dfsan_set_label_origin(c_label, c_origin, s, n); +} + SANITIZER_INTERFACE_ATTRIBUTE void *__dfsw_memcpy(void *dest, const void *src, size_t n, dfsan_label dest_label, dfsan_label src_label, @@ -519,6 +544,17 @@ void *__dfsw_memcpy(void *dest, const void *src, size_t n, return dfsan_memcpy(dest, src, n); } +SANITIZER_INTERFACE_ATTRIBUTE +void *__dfso_memcpy(void *dest, const void *src, size_t n, + dfsan_label dest_label, dfsan_label src_label, + dfsan_label n_label, dfsan_label *ret_label, + dfsan_origin dest_origin, dfsan_origin src_origin, + dfsan_origin n_origin, dfsan_origin *ret_origin) { + *ret_label = dest_label; + *ret_origin = dest_origin; + return dfsan_memcpy_with_origin(dest, src, n); +} + SANITIZER_INTERFACE_ATTRIBUTE void *__dfsw_memmove(void *dest, const void *src, size_t n, dfsan_label dest_label, dfsan_label src_label, @@ -527,6 +563,17 @@ void *__dfsw_memmove(void *dest, const void *src, size_t n, return dfsan_memmove(dest, src, n); } +SANITIZER_INTERFACE_ATTRIBUTE +void *__dfso_memmove(void *dest, const void *src, size_t n, + dfsan_label dest_label, dfsan_label src_label, + dfsan_label n_label, dfsan_label *ret_label, + dfsan_origin dest_origin, dfsan_origin src_origin, + dfsan_origin n_origin, dfsan_origin *ret_origin) { + *ret_label = dest_label; + *ret_origin = dest_origin; + return dfsan_memmove_with_origin(dest, src, n); +} + SANITIZER_INTERFACE_ATTRIBUTE void *__dfsw_memset(void *s, int c, size_t n, dfsan_label s_label, dfsan_label c_label, @@ -536,6 +583,18 @@ void *__dfsw_memset(void *s, int c, size_t n, return s; } +SANITIZER_INTERFACE_ATTRIBUTE +void *__dfso_memset(void 
*s, int c, size_t n, dfsan_label s_label, + dfsan_label c_label, dfsan_label n_label, + dfsan_label *ret_label, dfsan_origin s_origin, + dfsan_origin c_origin, dfsan_origin n_origin, + dfsan_origin *ret_origin) { + dfsan_memset_with_origin(s, c, c_label, c_origin, n); + *ret_label = s_label; + *ret_origin = s_origin; + return s; +} + SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strcat(char *dest, const char *src, dfsan_label dest_label, dfsan_label src_label, @@ -550,6 +609,23 @@ SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_strcat(char *dest, const char *src, return ret; } +SANITIZER_INTERFACE_ATTRIBUTE char *__dfso_strcat( + char *dest, const char *src, dfsan_label dest_label, dfsan_label src_label, + dfsan_label *ret_label, dfsan_origin dest_origin, dfsan_origin src_origin, + dfsan_origin *ret_origin) { + size_t dest_len = strlen(dest); + char *ret = strcat(dest, src); // NOLINT + dfsan_label *sdest = shadow_for(dest + dest_len); + const dfsan_label *ssrc = shadow_for(src); + size_t src_len = strlen(src); + dfsan_mem_origin_transfer(dest + dest_len, src, src_len); + internal_memcpy((void *)sdest, (const void *)ssrc, + src_len * sizeof(dfsan_label)); + *ret_label = dest_label; + *ret_origin = dest_origin; + return ret; +} + SANITIZER_INTERFACE_ATTRIBUTE char * __dfsw_strdup(const char *s, dfsan_label s_label, dfsan_label *ret_label) { size_t len = strlen(s); @@ -559,6 +635,19 @@ __dfsw_strdup(const char *s, dfsan_label s_label, dfsan_label *ret_label) { return static_cast(p); } +SANITIZER_INTERFACE_ATTRIBUTE char *__dfso_strdup(const char *s, + dfsan_label s_label, + dfsan_label *ret_label, + dfsan_origin s_origin, + dfsan_origin *ret_origin) { + size_t len = strlen(s); + void *p = malloc(len + 1); + dfsan_memcpy_with_origin(p, s, len + 1); + *ret_label = 0; + *ret_origin = 0; + return static_cast(p); +} + SANITIZER_INTERFACE_ATTRIBUTE char * __dfsw_strncpy(char *s1, const char *s2, size_t n, dfsan_label s1_label, dfsan_label s2_label, dfsan_label n_label, @@ -575,6 +664,24 @@ __dfsw_strncpy(char *s1, const char *s2, size_t n, dfsan_label s1_label, return s1; } +SANITIZER_INTERFACE_ATTRIBUTE char *__dfso_strncpy( + char *s1, const char *s2, size_t n, dfsan_label s1_label, + dfsan_label s2_label, dfsan_label n_label, dfsan_label *ret_label, + dfsan_origin s1_origin, dfsan_origin s2_origin, dfsan_origin n_origin, + dfsan_origin *ret_origin) { + size_t len = strlen(s2); + if (len < n) { + dfsan_memcpy_with_origin(s1, s2, len + 1); + dfsan_memset_with_origin(s1 + len + 1, 0, 0, 0, n - len - 1); + } else { + dfsan_memcpy_with_origin(s1, s2, n); + } + + *ret_label = s1_label; + *ret_origin = s1_origin; + return s1; +} + SANITIZER_INTERFACE_ATTRIBUTE ssize_t __dfsw_pread(int fd, void *buf, size_t count, off_t offset, dfsan_label fd_label, dfsan_label buf_label, @@ -587,6 +694,17 @@ __dfsw_pread(int fd, void *buf, size_t count, off_t offset, return ret; } +SANITIZER_INTERFACE_ATTRIBUTE ssize_t __dfso_pread( + int fd, void *buf, size_t count, off_t offset, dfsan_label fd_label, + dfsan_label buf_label, dfsan_label count_label, dfsan_label offset_label, + dfsan_label *ret_label, dfsan_origin fd_origin, dfsan_origin buf_origin, + dfsan_origin count_origin, dfsan_label offset_origin, + dfsan_origin *ret_origin) { + ssize_t ret = __dfsw_pread(fd, buf, count, offset, fd_label, buf_label, + count_label, offset_label, ret_label); + return ret; +} + SANITIZER_INTERFACE_ATTRIBUTE ssize_t __dfsw_read(int fd, void *buf, size_t count, dfsan_label fd_label, dfsan_label buf_label, @@ -599,6 +717,16 @@ 
__dfsw_read(int fd, void *buf, size_t count, return ret; } +SANITIZER_INTERFACE_ATTRIBUTE ssize_t __dfso_read( + int fd, void *buf, size_t count, dfsan_label fd_label, + dfsan_label buf_label, dfsan_label count_label, dfsan_label *ret_label, + dfsan_origin fd_origin, dfsan_origin buf_origin, dfsan_origin count_origin, + dfsan_origin *ret_origin) { + ssize_t ret = + __dfsw_read(fd, buf, count, fd_label, buf_label, count_label, ret_label); + return ret; +} + SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_clock_gettime(clockid_t clk_id, struct timespec *tp, dfsan_label clk_id_label, @@ -611,7 +739,15 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_clock_gettime(clockid_t clk_id, return ret; } -static void unpoison(const void *ptr, uptr size) { +SANITIZER_INTERFACE_ATTRIBUTE int __dfso_clock_gettime( + clockid_t clk_id, struct timespec *tp, dfsan_label clk_id_label, + dfsan_label tp_label, dfsan_label *ret_label, dfsan_origin clk_id_origin, + dfsan_origin tp_origin, dfsan_origin *ret_origin) { + int ret = __dfsw_clock_gettime(clk_id, tp, clk_id_label, tp_label, ret_label); + return ret; +} + +static void dfsan_set_zero_label(const void *ptr, uptr size) { dfsan_set_label(0, const_cast(ptr), size); } @@ -624,11 +760,21 @@ __dfsw_dlopen(const char *filename, int flag, dfsan_label filename_label, void *handle = dlopen(filename, flag); link_map *map = GET_LINK_MAP_BY_DLOPEN_HANDLE(handle); if (map) - ForEachMappedRegion(map, unpoison); + ForEachMappedRegion(map, dfsan_set_zero_label); *ret_label = 0; return handle; } +SANITIZER_INTERFACE_ATTRIBUTE void *__dfso_dlopen( + const char *filename, int flag, dfsan_label filename_label, + dfsan_label flag_label, dfsan_label *ret_label, + dfsan_origin filename_origin, dfsan_origin flag_origin, + dfsan_origin *ret_origin) { + void *handle = + __dfsw_dlopen(filename, flag, filename_label, flag_label, ret_label); + return handle; +} + static void *DFsanThreadStartFunc(void *arg) { DFsanThread *t = (DFsanThread *)arg; SetCurrentThread(t); @@ -715,6 +861,17 @@ struct dl_iterate_phdr_info { void *data; }; +struct dl_iterate_phdr_origin_info { + int (*callback_trampoline)(void *callback, struct dl_phdr_info *info, + size_t size, void *data, dfsan_label info_label, + dfsan_label size_label, dfsan_label data_label, + dfsan_label *ret_label, dfsan_origin info_origin, + dfsan_origin size_origin, dfsan_origin data_origin, + dfsan_origin *ret_origin); + void *callback; + void *data; +}; + int dl_iterate_phdr_cb(struct dl_phdr_info *info, size_t size, void *data) { dl_iterate_phdr_info *dipi = (dl_iterate_phdr_info *)data; dfsan_set_label(0, *info); @@ -728,6 +885,21 @@ int dl_iterate_phdr_cb(struct dl_phdr_info *info, size_t size, void *data) { 0, &ret_label); } +int dl_iterate_phdr_origin_cb(struct dl_phdr_info *info, size_t size, + void *data) { + dl_iterate_phdr_origin_info *dipi = (dl_iterate_phdr_origin_info *)data; + dfsan_set_label(0, *info); + dfsan_set_label(0, const_cast(info->dlpi_name), + strlen(info->dlpi_name) + 1); + dfsan_set_label( + 0, const_cast(reinterpret_cast(info->dlpi_phdr)), + sizeof(*info->dlpi_phdr) * info->dlpi_phnum); + dfsan_label ret_label; + dfsan_origin ret_origin; + return dipi->callback_trampoline(dipi->callback, info, size, dipi->data, 0, 0, + 0, &ret_label, 0, 0, 0, &ret_origin); +} + SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_dl_iterate_phdr( int (*callback_trampoline)(void *callback, struct dl_phdr_info *info, size_t size, void *data, dfsan_label info_label, @@ -740,6 +912,23 @@ SANITIZER_INTERFACE_ATTRIBUTE int __dfsw_dl_iterate_phdr( return 
dl_iterate_phdr(dl_iterate_phdr_cb, &dipi); } +SANITIZER_INTERFACE_ATTRIBUTE int __dfso_dl_iterate_phdr( + int (*callback_trampoline)(void *callback, struct dl_phdr_info *info, + size_t size, void *data, dfsan_label info_label, + dfsan_label size_label, dfsan_label data_label, + dfsan_label *ret_label, dfsan_origin info_origin, + dfsan_origin size_origin, + dfsan_origin data_origin, + dfsan_origin *ret_origin), + void *callback, void *data, dfsan_label callback_label, + dfsan_label data_label, dfsan_label *ret_label, + dfsan_origin callback_origin, dfsan_origin data_origin, + dfsan_origin *ret_origin) { + dl_iterate_phdr_origin_info dipi = {callback_trampoline, callback, data}; + *ret_label = 0; + return dl_iterate_phdr(dl_iterate_phdr_origin_cb, &dipi); +} + // This function is only available for glibc 2.27 or newer. Mark it weak so // linking succeeds with older glibcs. SANITIZER_WEAK_ATTRIBUTE void _dl_get_tls_static_info(size_t *sizep, @@ -754,6 +943,13 @@ SANITIZER_INTERFACE_ATTRIBUTE void __dfsw__dl_get_tls_static_info( dfsan_set_label(0, alignp, sizeof(*alignp)); } +SANITIZER_INTERFACE_ATTRIBUTE void __dfso__dl_get_tls_static_info( + size_t *sizep, size_t *alignp, dfsan_label sizep_label, + dfsan_label alignp_label, dfsan_origin sizep_origin, + dfsan_origin alignp_origin) { + __dfsw__dl_get_tls_static_info(sizep, alignp, sizep_label, alignp_label); +} + SANITIZER_INTERFACE_ATTRIBUTE char *__dfsw_ctime_r(const time_t *timep, char *buf, dfsan_label timep_label, dfsan_label buf_label, dfsan_label *ret_label) { diff --git a/compiler-rt/test/dfsan/custom.cpp b/compiler-rt/test/dfsan/custom.cpp index 7825f7aa8f32..1498c104160e 100644 --- a/compiler-rt/test/dfsan/custom.cpp +++ b/compiler-rt/test/dfsan/custom.cpp @@ -217,41 +217,68 @@ void test_bcmp() { ASSERT_ZERO_LABEL(rv); } -#if !defined(ORIGIN_TRACKING) void test_memcpy() { char str1[] = "str1"; char str2[sizeof(str1)]; dfsan_set_label(i_label, &str1[3], 1); - ASSERT_ZERO_LABEL(memcpy(str2, str1, sizeof(str1))); + DEFINE_AND_SAVE_ORIGINS(str1) + + char *ptr2 = str2; + dfsan_set_label(j_label, &ptr2, sizeof(ptr2)); + + void *r = memcpy(ptr2, str1, sizeof(str1)); + ASSERT_LABEL(r, j_label); + ASSERT_EQ_ORIGIN(r, ptr2); assert(0 == memcmp(str2, str1, sizeof(str1))); ASSERT_ZERO_LABEL(str2[0]); ASSERT_LABEL(str2[3], i_label); + + for (int i = 0; i < sizeof(str2); ++i) { + if (!dfsan_get_label(str2[i])) + continue; + ASSERT_INIT_ORIGIN(&(str2[i]), str1_o[i]); + } } void test_memmove() { char str[] = "str1xx"; dfsan_set_label(i_label, &str[3], 1); - ASSERT_ZERO_LABEL(memmove(str + 2, str, 4)); + DEFINE_AND_SAVE_ORIGINS(str) + + char *ptr = str + 2; + dfsan_set_label(j_label, &ptr, sizeof(ptr)); + + void *r = memmove(ptr, str, 4); + ASSERT_LABEL(r, j_label); + ASSERT_EQ_ORIGIN(r, ptr); assert(0 == memcmp(str + 2, "str1", 4)); - for (int i = 0; i <= 4; ++i) - ASSERT_ZERO_LABEL(str[i]); + ASSERT_ZERO_LABEL(str[4]); ASSERT_LABEL(str[5], i_label); + + for (int i = 0; i < 4; ++i) { + if (!dfsan_get_label(ptr[i])) + continue; + ASSERT_INIT_ORIGIN(&(ptr[i]), str_o[i]); + } } void test_memset() { char buf[8]; int j = 'a'; + char *ptr = buf; dfsan_set_label(j_label, &j, sizeof(j)); - - ASSERT_ZERO_LABEL(memset(&buf, j, sizeof(buf))); + dfsan_set_label(k_label, &ptr, sizeof(ptr)); + void *ret = memset(ptr, j, sizeof(buf)); + ASSERT_LABEL(ret, k_label); + ASSERT_EQ_ORIGIN(ret, ptr); for (int i = 0; i < 8; ++i) { ASSERT_LABEL(buf[i], j_label); + ASSERT_EQ_ORIGIN(buf[i], j); assert(buf[i] == 'a'); } } -#endif // !defined(ORIGIN_TRACKING) void 
test_strcmp() { char str1[] = "str1", str2[] = "str2"; @@ -278,18 +305,34 @@ void test_strcmp() { #endif } -#if !defined(ORIGIN_TRACKING) void test_strcat() { char src[] = "world"; + int volatile x = 0; // buffer to ensure src and dst do not share origins char dst[] = "hello \0 "; + int volatile y = 0; // buffer to ensure dst and p do not share origins char *p = dst; dfsan_set_label(k_label, &p, sizeof(p)); dfsan_set_label(i_label, src, sizeof(src)); dfsan_set_label(j_label, dst, sizeof(dst)); + dfsan_origin dst_o = dfsan_get_origin((long)dst[0]); char *ret = strcat(p, src); ASSERT_LABEL(ret, k_label); + ASSERT_EQ_ORIGIN(ret, p); assert(ret == dst); assert(strcmp(src, dst + 6) == 0); + // Origins are assigned for every 4 contiguous 4-aligned bytes. After + // appending src to dst, origins of src can overwrite origins of dst if their + // application adddresses are within [start_aligned_down, end_aligned_up). + // Other origins are not changed. + char *start_aligned_down = (char *)(((size_t)(dst + 6)) & ~3UL); + char *end_aligned_up = (char *)(((size_t)(dst + 11 + 4)) & ~3UL); + for (int i = 0; i < 12; ++i) { + if (dst + i < start_aligned_down || dst + i >= end_aligned_up) { + ASSERT_INIT_ORIGIN(&dst[i], dst_o); + } else { + ASSERT_INIT_ORIGIN_EQ_ORIGIN(&dst[i], src[0]); + } + } for (int i = 0; i < 6; ++i) { ASSERT_LABEL(dst[i], j_label); } @@ -299,7 +342,6 @@ void test_strcat() { } ASSERT_LABEL(dst[11], j_label); } -#endif // !defined(ORIGIN_TRACKING) void test_strlen() { char str1[] = "str1"; @@ -315,14 +357,22 @@ void test_strlen() { #endif } -#if !defined(ORIGIN_TRACKING) void test_strdup() { char str1[] = "str1"; dfsan_set_label(i_label, &str1[3], 1); + DEFINE_AND_SAVE_ORIGINS(str1) char *strd = strdup(str1); + ASSERT_ZERO_LABEL(strd); ASSERT_ZERO_LABEL(strd[0]); ASSERT_LABEL(strd[3], i_label); + + for (int i = 0; i < strlen(strd); ++i) { + if (!dfsan_get_label(strd[i])) + continue; + ASSERT_INIT_ORIGIN(&(strd[i]), str1_o[i]); + } + free(strd); } @@ -339,16 +389,29 @@ void test_strncpy() { ASSERT_ZERO_LABEL(strd[1]); ASSERT_ZERO_LABEL(strd[2]); ASSERT_LABEL(strd[3], i_label); + ASSERT_INIT_ORIGIN_EQ_ORIGIN(&(strd[3]), str1[3]); - strd = strncpy(str2, str1, 3); + char *p2 = str2; + dfsan_set_label(j_label, &p2, sizeof(p2)); + strd = strncpy(p2, str1, 3); assert(strd == str2); assert(strncmp(str1, str2, 3) == 0); - ASSERT_ZERO_LABEL(strd); + ASSERT_LABEL(strd, j_label); + ASSERT_EQ_ORIGIN(strd, p2); + // When -dfsan-combine-pointer-labels-on-load is on, strd's label propagates + // to strd[i]'s label. When ORIGIN_TRACKING is defined, + // -dfsan-combine-pointer-labels-on-load is always off, otherwise the flag + // is on by default. +#if defined(ORIGIN_TRACKING) ASSERT_ZERO_LABEL(strd[0]); ASSERT_ZERO_LABEL(strd[1]); ASSERT_ZERO_LABEL(strd[2]); +#else + ASSERT_LABEL(strd[0], j_label); + ASSERT_LABEL(strd[1], j_label); + ASSERT_LABEL(strd[2], j_label); +#endif } -#endif // !defined(ORIGIN_TRACKING) void test_strncmp() { char str1[] = "str1", str2[] = "str2"; @@ -523,7 +586,6 @@ void test_strchr() { #endif } -#if !defined(ORIGIN_TRACKING) void test_calloc() { // With any luck this sequence of calls will cause calloc to return the same // pointer both times. 
This is probably the best we can do to test this @@ -538,6 +600,7 @@ void test_calloc() { free(crv); } +#if !defined(ORIGIN_TRACKING) void test_recvmmsg() { int sockfds[2]; int ret = socketpair(AF_UNIX, SOCK_DGRAM, 0, sockfds); @@ -630,12 +693,14 @@ void test_recvmsg() { close(sockfds[0]); close(sockfds[1]); } +#endif // !defined(ORIGIN_TRACKING) void test_read() { char buf[16]; dfsan_set_label(i_label, buf, 1); dfsan_set_label(j_label, buf + 15, 1); + DEFINE_AND_SAVE_ORIGINS(buf) ASSERT_LABEL(buf[0], i_label); ASSERT_LABEL(buf[15], j_label); @@ -645,6 +710,7 @@ void test_read() { ASSERT_ZERO_LABEL(rv); ASSERT_ZERO_LABEL(buf[0]); ASSERT_ZERO_LABEL(buf[15]); + ASSERT_SAVED_ORIGINS(buf) close(fd); } @@ -653,6 +719,7 @@ void test_pread() { dfsan_set_label(i_label, buf, 1); dfsan_set_label(j_label, buf + 15, 1); + DEFINE_AND_SAVE_ORIGINS(buf) ASSERT_LABEL(buf[0], i_label); ASSERT_LABEL(buf[15], j_label); @@ -662,6 +729,7 @@ void test_pread() { ASSERT_ZERO_LABEL(rv); ASSERT_ZERO_LABEL(buf[0]); ASSERT_ZERO_LABEL(buf[15]); + ASSERT_SAVED_ORIGINS(buf) close(fd); } @@ -678,12 +746,15 @@ void test_dlopen() { void test_clock_gettime() { struct timespec tp; dfsan_set_label(j_label, ((char *)&tp) + 3, 1); + dfsan_origin origin = dfsan_get_origin((long)(((char *)&tp)[3])); int t = clock_gettime(CLOCK_REALTIME, &tp); assert(t == 0); ASSERT_ZERO_LABEL(t); ASSERT_ZERO_LABEL(((char *)&tp)[3]); + ASSERT_ORIGIN(((char *)&tp)[3], origin); } +#if !defined(ORIGIN_TRACKING) void test_ctime_r() { char *buf = (char*) malloc(64); time_t t = 0; @@ -704,6 +775,7 @@ void test_ctime_r() { ASSERT_LABEL(ret, j_label); ASSERT_READ_ZERO_LABEL(buf, strlen(buf) + 1); } +#endif // !defined(ORIGIN_TRACKING) static int write_callback_count = 0; static int last_fd; @@ -728,6 +800,8 @@ void test_dfsan_set_write_callback() { write_callback_count = 0; + DEFINE_AND_SAVE_ORIGINS(buf) + // Callback should be invoked on every call to write(). int res = write(fd, buf, buf_len); assert(write_callback_count == 1); @@ -736,12 +810,21 @@ void test_dfsan_set_write_callback() { ASSERT_READ_ZERO_LABEL(last_buf, sizeof(last_buf)); ASSERT_READ_ZERO_LABEL(&last_count, sizeof(last_count)); + for (int i = 0; i < buf_len; ++i) + ASSERT_ORIGIN(last_buf[i], buf_o[i]); + + ASSERT_ZERO_ORIGINS(&last_count, sizeof(last_count)); + // Add a label to write() arguments. Check that the labels are readable from // the values passed to the callback. dfsan_set_label(i_label, &fd, sizeof(fd)); dfsan_set_label(j_label, &(buf[3]), 1); dfsan_set_label(k_label, &buf_len, sizeof(buf_len)); + dfsan_origin fd_o = dfsan_get_origin((long)fd); + dfsan_origin buf3_o = dfsan_get_origin((long)(buf[3])); + dfsan_origin buf_len_o = dfsan_get_origin((long)buf_len); + res = write(fd, buf, buf_len); assert(write_callback_count == 2); ASSERT_READ_ZERO_LABEL(&res, sizeof(res)); @@ -749,10 +832,27 @@ void test_dfsan_set_write_callback() { ASSERT_READ_LABEL(&last_buf[3], sizeof(last_buf[3]), j_label); ASSERT_READ_LABEL(last_buf, sizeof(last_buf), j_label); ASSERT_READ_LABEL(&last_count, sizeof(last_count), k_label); + ASSERT_ZERO_ORIGINS(&res, sizeof(res)); + ASSERT_INIT_ORIGINS(&last_fd, sizeof(last_fd), fd_o); + ASSERT_INIT_ORIGINS(&last_buf[3], sizeof(last_buf[3]), buf3_o); + + // Origins are assigned for every 4 contiguous 4-aligned bytes. After + // appending src to dst, origins of src can overwrite origins of dst if their + // application adddresses are within an aligned range. Other origins are not + // changed. 
+ for (int i = 0; i < buf_len; ++i) { + size_t i_addr = size_t(&last_buf[i]); + if (((size_t(&last_buf[3]) & ~3UL) > i_addr) || + (((size_t(&last_buf[3]) + 4) & ~3UL) <= i_addr)) + ASSERT_ORIGIN(last_buf[i], buf_o[i]); + } + + ASSERT_INIT_ORIGINS(&last_count, sizeof(last_count), buf_len_o); dfsan_set_write_callback(NULL); } +#if !defined(ORIGIN_TRACKING) void test_fgets() { char *buf = (char*) malloc(128); FILE *f = fopen("/etc/passwd", "r"); @@ -1126,7 +1226,6 @@ void test_pthread_create() { // check-wrappers script. void test_pthread_join() {} -#if !defined(ORIGIN_TRACKING) int dl_iterate_phdr_test_cb(struct dl_phdr_info *info, size_t size, void *data) { assert(data == (void *)3); @@ -1151,11 +1250,16 @@ void test__dl_get_tls_static_info() { size_t sizep = 0, alignp = 0; dfsan_set_label(i_label, &sizep, sizeof(sizep)); dfsan_set_label(i_label, &alignp, sizeof(alignp)); + dfsan_origin sizep_o = dfsan_get_origin(sizep); + dfsan_origin alignp_o = dfsan_get_origin(alignp); _dl_get_tls_static_info(&sizep, &alignp); ASSERT_ZERO_LABEL(sizep); ASSERT_ZERO_LABEL(alignp); + ASSERT_ORIGIN(sizep, sizep_o); + ASSERT_ORIGIN(alignp, alignp_o); } +#if !defined(ORIGIN_TRACKING) void test_strrchr() { char str1[] = "str1str1"; dfsan_set_label(i_label, &str1[7], 1); @@ -1559,17 +1663,17 @@ int main(void) { assert(i_j_label != j_label); assert(i_j_label != k_label); -#if !defined(ORIGIN_TRACKING) test__dl_get_tls_static_info(); -#endif // !defined(ORIGIN_TRACKING) test_bcmp(); -#if !defined(ORIGIN_TRACKING) test_calloc(); test_clock_gettime(); +#if !defined(ORIGIN_TRACKING) test_ctime_r(); +#endif // !defined(ORIGIN_TRACKING) test_dfsan_set_write_callback(); test_dl_iterate_phdr(); test_dlopen(); +#if !defined(ORIGIN_TRACKING) test_epoll_wait(); test_fgets(); #endif // !defined(ORIGIN_TRACKING) @@ -1591,18 +1695,18 @@ int main(void) { test_memchr(); #endif // !defined(ORIGIN_TRACKING) test_memcmp(); -#if !defined(ORIGIN_TRACKING) test_memcpy(); test_memmove(); test_memset(); +#if !defined(ORIGIN_TRACKING) test_nanosleep(); test_poll(); - test_pread(); #endif // !defined(ORIGIN_TRACKING) + test_pread(); test_pthread_create(); test_pthread_join(); -#if !defined(ORIGIN_TRACKING) test_read(); +#if !defined(ORIGIN_TRACKING) test_recvmmsg(); test_recvmsg(); test_sched_getaffinity(); @@ -1621,17 +1725,15 @@ int main(void) { test_strcasecmp(); test_strchr(); test_strcmp(); -#if !defined(ORIGIN_TRACKING) test_strcat(); +#if !defined(ORIGIN_TRACKING) test_strcpy(); - test_strdup(); #endif // !defined(ORIGIN_TRACKING) + test_strdup(); test_strlen(); test_strncasecmp(); test_strncmp(); -#if !defined(ORIGIN_TRACKING) test_strncpy(); -#endif // !defined(ORIGIN_TRACKING) test_strpbrk(); #if !defined(ORIGIN_TRACKING) test_strrchr(); -- GitLab From 5b2d8503d1d4b925e30fd2b91f97bfd625f03157 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Fri, 19 Mar 2021 16:21:15 +0000 Subject: [PATCH 0157/1000] [mlir][Linalg] NFC - Expose helper function `substituteMin`. 
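For reference, a minimal sketch of calling the newly exposed helper, mirroring its use in AffineMinSCFCanonicalizationPattern further down in this diff; the enclosing rewrite-pattern boilerplate is omitted and assumed:

    // Substitute the scf loop induction variables appearing in the
    // affine_min map by their loop bounds; with no hook passed, every
    // enclosing loop is substituted.
    auto mapAndOperands = mlir::linalg::substituteMin(minOp);
    AffineMap simplifiedMap = mapAndOperands.map;
    // mapAndOperands.dims and mapAndOperands.symbols hold the dim and
    // symbol operands that pair with the returned map.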
--- .../Dialect/Linalg/Transforms/Transforms.h | 24 +++++++ .../Dialect/Linalg/Transforms/Transforms.cpp | 63 ++++++++++++++----- 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 54a4aec9f867..6d428384080b 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -893,6 +893,30 @@ struct AffineMinSCFCanonicalizationPattern PatternRewriter &rewriter) const override; }; + /// Helper struct to return the results of `substituteMin`. +struct AffineMapAndOperands { + AffineMap map; + SmallVector dims; + SmallVector symbols; +}; +/// Traverse the dims of the AffineMap of `affineMinOp` and substitute scf loop +/// induction variables by new expressions involving the lower or upper bound: +/// - If the AffineDimExpr mapped to a loop IV has a positive sign, it is +/// replaced by the loop upper bound. +/// - If the AffineDimExpr mapped to a loop IV has a negative sign, it is +/// replaced by the loop lower bound. +/// All loop induction variables are iteratively replaced, unless a +/// `substituteOperation` hook is passed to more finely determine which +/// operations are substituted. +/// This is used as an intermediate step in computing bounding boxes and +/// canonicalize AffineMinOps. All dim and symbol operands are assumed to have +/// positive values (positive orthant assumptions). +/// Return a new AffineMap, dims and symbols that have been canonicalized and +/// simplified. +AffineMapAndOperands substituteMin( + AffineMinOp affineMinOp, + llvm::function_ref substituteOperation = nullptr); + /// Converts Convolution op into vector contraction. /// /// Conversion expects ConvOp to have dimensions marked in the *mask* as diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index c2e52c63eabd..fef6dd8f996f 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -536,8 +536,10 @@ static AffineExpr substituteLoopInExpr(AffineExpr expr, AffineExpr dimExpr, /// Traverse the `dims` and substitute known min or max expressions in place of /// induction variables in `exprs`. 
-static AffineMap substitute(AffineMap map, SmallVectorImpl &dims, - SmallVectorImpl &symbols) { +static AffineMap substitute( + AffineMap map, SmallVectorImpl &dims, + SmallVectorImpl &symbols, + llvm::function_ref substituteOperation = nullptr) { auto exprs = llvm::to_vector<4>(map.getResults()); for (AffineExpr &expr : exprs) { bool substituted = true; @@ -549,17 +551,19 @@ static AffineMap substitute(AffineMap map, SmallVectorImpl &dims, LLVM_DEBUG(DBGS() << "Subst: " << dim << " @ " << dimExpr << "\n"); AffineExpr substitutedExpr; if (auto forOp = scf::getForInductionVarOwner(dim)) - substitutedExpr = substituteLoopInExpr( - expr, dimExpr, forOp.lowerBound(), forOp.upperBound(), - forOp.step(), dims, symbols); + if (!substituteOperation || substituteOperation(forOp)) + substitutedExpr = substituteLoopInExpr( + expr, dimExpr, forOp.lowerBound(), forOp.upperBound(), + forOp.step(), dims, symbols); if (auto parallelForOp = scf::getParallelForInductionVarOwner(dim)) - for (unsigned idx = 0, e = parallelForOp.getNumLoops(); idx < e; - ++idx) - substitutedExpr = substituteLoopInExpr( - expr, dimExpr, parallelForOp.lowerBound()[idx], - parallelForOp.upperBound()[idx], parallelForOp.step()[idx], - dims, symbols); + if (!substituteOperation || substituteOperation(parallelForOp)) + for (unsigned idx = 0, e = parallelForOp.getNumLoops(); idx < e; + ++idx) + substitutedExpr = substituteLoopInExpr( + expr, dimExpr, parallelForOp.lowerBound()[idx], + parallelForOp.upperBound()[idx], parallelForOp.step()[idx], + dims, symbols); if (!substitutedExpr) continue; @@ -578,6 +582,9 @@ static AffineMap substitute(AffineMap map, SmallVectorImpl &dims, exprs.front().getContext()); LLVM_DEBUG(DBGS() << "Map to simplify: " << map << "\n"); + LLVM_DEBUG(DBGS() << "Operands:\n"); + for (Value v : operands) + LLVM_DEBUG(DBGS() << v << "\n"); // Pull in affine.apply operations and compose them fully into the // result. @@ -596,14 +603,38 @@ static AffineMap substitute(AffineMap map, SmallVectorImpl &dims, return AffineMap::get(dims.size(), symbols.size(), exprs, map.getContext()); } +/// Traverse the dims of the AffineMap of `affineMinOp` and substitute scf loop +/// induction variables by new expressions involving the lower or upper bound: +/// - If the AffineDimExpr mapped to a loop IV has a positive sign, it is +/// replaced by the loop upper bound. +/// - If the AffineDimExpr mapped to a loop IV has a negative sign, it is +/// replaced by the loop lower bound. +/// All loop induction variables are iteratively replaced, unless a +/// `substituteOperation` hook is passed to more finely determine which +/// operations are substituted. +/// This is used as an intermediate step in computing bounding boxes and +/// canonicalize AffineMinOps. All dim and symbol operands are assumed to have +/// positive values (positive orthant assumptions). +/// Return a new AffineMap, dims and symbols that have been canonicalized and +/// simplified. 
+AffineMapAndOperands mlir::linalg::substituteMin( + AffineMinOp affineMinOp, + llvm::function_ref substituteOperation) { + AffineMapAndOperands res{affineMinOp.getAffineMap(), + SmallVector(affineMinOp.getDimOperands()), + SmallVector(affineMinOp.getSymbolOperands())}; + res.map = substitute(affineMinOp.getAffineMap(), res.dims, res.symbols, + substituteOperation); + return res; +} + LogicalResult AffineMinSCFCanonicalizationPattern::matchAndRewrite( AffineMinOp minOp, PatternRewriter &rewriter) const { LLVM_DEBUG(DBGS() << "Canonicalize AffineMinSCF: " << *minOp.getOperation() << "\n"); - SmallVector dims(minOp.getDimOperands()), - symbols(minOp.getSymbolOperands()); - AffineMap map = substitute(minOp.getAffineMap(), dims, symbols); + auto affineMapAndOperands = substituteMin(minOp); + AffineMap map = affineMapAndOperands.map; LLVM_DEBUG(DBGS() << "Resulting map: " << map << "\n"); @@ -638,8 +669,8 @@ LogicalResult AffineMinSCFCanonicalizationPattern::matchAndRewrite( rewriter.replaceOpWithNewOp(minOp, cst.getValue()); } else { auto resultMap = AffineMap::get(0, map.getNumSymbols(), {e}, ctx); - SmallVector resultOperands = dims; - resultOperands.append(symbols.begin(), symbols.end()); + SmallVector resultOperands = affineMapAndOperands.dims; + llvm::append_range(resultOperands, affineMapAndOperands.symbols); canonicalizeMapAndOperands(&resultMap, &resultOperands); resultMap = simplifyAffineMap(resultMap); rewriter.replaceOpWithNewOp(minOp, resultMap, -- GitLab From 3aa6a4cb39c4032983bbc0aaeda646ebdd3ebefa Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 17 Mar 2021 11:09:41 +0000 Subject: [PATCH 0158/1000] [libcxx][Arm] Move buildbot flags into cmake files Reviewed By: #libc, Mordante, curdeius Differential Revision: https://reviews.llvm.org/D98771 --- .../caches/{Armv7.cmake => Armv7Arm.cmake} | 2 ++ .../caches/Armv7Thumb-noexceptions.cmake | 6 ++++++ .../caches/{Armv8.cmake => Armv8Arm.cmake} | 2 ++ .../caches/Armv8Thumb-noexceptions.cmake | 6 ++++++ libcxx/utils/ci/run-buildbot | 20 ++++--------------- 5 files changed, 20 insertions(+), 16 deletions(-) rename libcxx/cmake/caches/{Armv7.cmake => Armv7Arm.cmake} (56%) create mode 100644 libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake rename libcxx/cmake/caches/{Armv8.cmake => Armv8Arm.cmake} (56%) create mode 100644 libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake diff --git a/libcxx/cmake/caches/Armv7.cmake b/libcxx/cmake/caches/Armv7Arm.cmake similarity index 56% rename from libcxx/cmake/caches/Armv7.cmake rename to libcxx/cmake/caches/Armv7Arm.cmake index 34b90083bd7d..8b2b54eba13c 100644 --- a/libcxx/cmake/caches/Armv7.cmake +++ b/libcxx/cmake/caches/Armv7Arm.cmake @@ -1,2 +1,4 @@ set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") set(LIBCXX_TARGET_TRIPLE "armv7-linux-gnueabihf" CACHE STRING "") +set(CMAKE_CXX_FLAGS "-marm" CACHE STRING "") +set(CMAKE_C_FLAGS "-marm" CACHE STRING "") diff --git a/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake b/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake new file mode 100644 index 000000000000..67ec43b93f20 --- /dev/null +++ b/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake @@ -0,0 +1,6 @@ +set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "") +set(LIBCXX_TARGET_TRIPLE "armv7-linux-gnueabihf" CACHE STRING "") +set(CMAKE_CXX_FLAGS "-mthumb" CACHE STRING "") +set(CMAKE_C_FLAGS "-mthumb" CACHE STRING "") +set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") +set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Armv8.cmake b/libcxx/cmake/caches/Armv8Arm.cmake 
similarity index 56%
rename from libcxx/cmake/caches/Armv8.cmake
rename to libcxx/cmake/caches/Armv8Arm.cmake
index 85da66cbea54..55dfa908b3d0 100644
--- a/libcxx/cmake/caches/Armv8.cmake
+++ b/libcxx/cmake/caches/Armv8Arm.cmake
@@ -1,2 +1,4 @@
 set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "")
 set(LIBCXX_TARGET_TRIPLE "armv8-linux-gnueabihf" CACHE STRING "")
+set(CMAKE_CXX_FLAGS "-marm" CACHE STRING "")
+set(CMAKE_C_FLAGS "-marm" CACHE STRING "")
diff --git a/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake b/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake
new file mode 100644
index 000000000000..fb1d10efaddc
--- /dev/null
+++ b/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake
@@ -0,0 +1,6 @@
+set(LIBCXXABI_USE_LLVM_UNWINDER ON CACHE BOOL "")
+set(LIBCXX_TARGET_TRIPLE "armv8-linux-gnueabihf" CACHE STRING "")
+set(CMAKE_CXX_FLAGS "-mthumb" CACHE STRING "")
+set(CMAKE_C_FLAGS "-mthumb" CACHE STRING "")
+set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
+set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "")
diff --git a/libcxx/utils/ci/run-buildbot b/libcxx/utils/ci/run-buildbot
index f76a669f9f1d..04f6cf3fc375 100755
--- a/libcxx/utils/ci/run-buildbot
+++ b/libcxx/utils/ci/run-buildbot
@@ -402,35 +402,23 @@ aarch64-noexceptions)
 # Aka Armv8 32 bit
 armv8)
     clean
-    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Armv8.cmake" \
-                   -DCMAKE_CXX_FLAGS="-marm" \
-                   -DCMAKE_C_FLAGS="-marm"
+    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Armv8Arm.cmake"
     check-cxx-cxxabi
 ;;
 armv8-noexceptions)
     clean
-    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Armv8.cmake" \
-                   -DCMAKE_CXX_FLAGS="-mthumb" \
-                   -DCMAKE_C_FLAGS="-mthumb" \
-                   -DLIBCXX_ENABLE_EXCEPTIONS=OFF \
-                   -DLIBCXXABI_ENABLE_EXCEPTIONS=OFF
+    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Armv8Thumb-noexceptions.cmake"
     check-cxx-cxxabi
 ;;
 # Armv7 32 bit. One building Arm only one Thumb only code.
 armv7)
     clean
-    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Armv7.cmake" \
-                   -DCMAKE_CXX_FLAGS="-marm" \
-                   -DCMAKE_C_FLAGS="-marm"
+    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Armv7Arm.cmake"
     check-cxx-cxxabi
 ;;
 armv7-noexceptions)
     clean
-    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Armv7.cmake" \
-                   -DCMAKE_CXX_FLAGS="-mthumb" \
-                   -DCMAKE_C_FLAGS="-mthumb" \
-                   -DLIBCXX_ENABLE_EXCEPTIONS=OFF \
-                   -DLIBCXXABI_ENABLE_EXCEPTIONS=OFF
+    generate-cmake -C "${MONOREPO_ROOT}/libcxx/cmake/caches/Armv7Thumb-noexceptions.cmake"
     check-cxx-cxxabi
 ;;
 *)
-- 
GitLab

From aafc3f7be804d117a632365489a18c3e484a3931 Mon Sep 17 00:00:00 2001
From: Markus Böck
Date: Fri, 19 Mar 2021 17:47:07 +0100
Subject: [PATCH 0159/1000] [Driver] Add -print-runtime-dir

This patch adds a new command line option to clang that prints the
directory containing clang's runtime libraries to stdout.

The primary use case for this flag is build systems that use clang-cl.
Such build systems invoke the linker, link.exe or lld-link in this
case, directly instead of linking through the compiler driver, as is
common with the other drivers. This causes problems when clang runtime
libraries, such as the sanitizer or profiling runtimes, have to be
linked in, because the compiler cannot communicate its runtime library
directory to the linker.

With this flag, build systems can query the directory containing all of
clang's runtime libraries and add it to the linker search path.
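For illustration only (this usage sketch is not part of the patch, and
the runtime library name shown, clang_rt.profile-x86_64.lib, is
hypothetical; the actual file name depends on the target and the
resource directory layout), a build system could consume the flag
roughly like so:

  # Query clang for its runtime library directory, then pass it to lld-link.
  RTDIR=$(clang --target=x86_64-pc-windows-msvc -print-runtime-dir)
  lld-link main.obj "/libpath:$RTDIR" clang_rt.profile-x86_64.lib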
Differential Revision: https://reviews.llvm.org/D98868
---
 clang/include/clang/Driver/Options.td                 |  2 ++
 clang/lib/Driver/Driver.cpp                           |  9 +++++++++
 .../lib/windows/clang_rt.builtins-x86_64.lib          |  0
 .../lib/x86_64-pc-windows-msvc/clang_rt.builtins.lib  |  0
 clang/test/Driver/immediate-options.c                 | 12 ++++++++++++
 5 files changed, 23 insertions(+)
 create mode 100644 clang/test/Driver/Inputs/resource_dir/lib/windows/clang_rt.builtins-x86_64.lib
 create mode 100644 clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/x86_64-pc-windows-msvc/clang_rt.builtins.lib

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index a9b43a8fe620..b7efb7469a23 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3566,6 +3566,8 @@ def print_targets : Flag<["-", "--"], "print-targets">,
   HelpText<"Print the registered targets">;
 def print_rocm_search_dirs : Flag<["-", "--"], "print-rocm-search-dirs">,
   HelpText<"Print the paths used for finding ROCm installation">;
+def print_runtime_dir : Flag<["-", "--"], "print-runtime-dir">,
+  HelpText<"Print the directory pathname containing clang's runtime libraries">;
 def private__bundle : Flag<["-"], "private_bundle">;
 def pthreads : Flag<["-"], "pthreads">;
 defm pthread : BoolOption<"", "pthread",
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index dbd365e7c9bc..e70263e6a295 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -1824,6 +1824,15 @@ bool Driver::HandleImmediateArgs(const Compilation &C) {
     return false;
   }

+  if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) {
+    if (auto RuntimePath = TC.getRuntimePath()) {
+      llvm::outs() << *RuntimePath << '\n';
+      return false;
+    }
+    llvm::outs() << TC.getCompilerRTPath() << '\n';
+    return false;
+  }
+
   // FIXME: The following handlers should use a callback mechanism, we don't
   // know what the client would like to do.
   if (Arg *A = C.getArgs().getLastArg(options::OPT_print_file_name_EQ)) {
diff --git a/clang/test/Driver/Inputs/resource_dir/lib/windows/clang_rt.builtins-x86_64.lib b/clang/test/Driver/Inputs/resource_dir/lib/windows/clang_rt.builtins-x86_64.lib
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/x86_64-pc-windows-msvc/clang_rt.builtins.lib b/clang/test/Driver/Inputs/resource_dir_with_per_target_subdir/lib/x86_64-pc-windows-msvc/clang_rt.builtins.lib
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/test/Driver/immediate-options.c b/clang/test/Driver/immediate-options.c
index 723a6fa302f8..c398e0d41c6e 100644
--- a/clang/test/Driver/immediate-options.c
+++ b/clang/test/Driver/immediate-options.c
@@ -17,3 +17,15 @@
 // Allow unspecified output because the value of CLANG_RESOURCE_DIR is unknown.
// RUN: %clang -print-resource-dir | FileCheck %s -check-prefix=PRINT-RESOURCE-DIR // PRINT-RESOURCE-DIR: {{.+}} + +// Default resource-dir layout +// RUN: %clang -print-runtime-dir --target=x86_64-pc-windows-msvc \ +// RUN: -resource-dir=%S/Inputs/resource_dir \ +// RUN: | FileCheck --check-prefix=PRINT-RUNTIME-DIR %s +// PRINT-RUNTIME-DIR: lib{{/|\\}}windows + +// Per target dir layout +// RUN: %clang -print-runtime-dir --target=x86_64-pc-windows-msvc \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: | FileCheck --check-prefix=PRINT-RUNTIME-DIR-PER-TARGET %s +// PRINT-RUNTIME-DIR-PER-TARGET: lib{{/|\\}}x86_64-pc-windows-msvc -- GitLab From 5df52f7708566975975a8912abd2fa41dfa3333f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Mar 2021 16:08:10 +0000 Subject: [PATCH 0160/1000] [AMDGPU] Remove weird target triples from tests. NFC. --- .../test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll | 10 +++++----- .../AMDGPU/atomic_optimizations_global_pointer.ll | 10 +++++----- .../AMDGPU/atomic_optimizations_local_pointer.ll | 10 +++++----- .../CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll | 10 +++++----- .../AMDGPU/atomic_optimizations_struct_buffer.ll | 10 +++++----- llvm/test/CodeGen/AMDGPU/spill-before-exec.mir | 2 +- 6 files changed, 26 insertions(+), 26 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index bccb3f68dcee..cdd4db7f8dbc 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope 
-check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 7db3e8a9ae8b..40520f6e8d29 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN64 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN32 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN64 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN32 %s declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index eadcb2a1eca2..bb56be5f12a4 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index bd0ec2efec2d..a73bf61340e0 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 
-mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.raw.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index 3a29c101babd..43f52bdf192b 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX89,DPPCOMB 
%s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN64,GFX8MORE,GFX8MORE64,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN32,GFX8MORE,GFX8MORE32,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @llvm.amdgcn.struct.buffer.atomic.add(i32, <4 x i32>, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir index 76e7d73cdf6c..fe5b4eb45046 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir @@ -1,5 +1,5 @@ # REQUIRES: asserts -# RUN: llc -mtriple=amdgcn--- -verify-machineinstrs -debug-only=regalloc -run-pass=greedy -o /dev/null %s 2>&1 | FileCheck %s +# RUN: llc -mtriple=amdgcn-- -verify-machineinstrs -debug-only=regalloc -run-pass=greedy -o /dev/null %s 2>&1 | FileCheck %s --- # Check that physreg candidate is not used since cannot be spilled in a block, -- GitLab From 87248e852b71396194e4bb4a893633a8c47ac1e0 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Mar 2021 16:43:52 +0000 Subject: [PATCH 0161/1000] [AMDGPU] Rationalize some check prefixes and use more common prefixes. NFC. --- .../atomic_optimizations_global_pointer.ll | 1740 ++++++++--------- .../atomic_optimizations_local_pointer.ll | 90 +- .../atomic_optimizations_pixelshader.ll | 104 +- 3 files changed, 935 insertions(+), 999 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 40520f6e8d29..9e06bac33630 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -2,8 +2,8 @@ ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN64 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GCN32 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -76,76 +76,76 @@ define 
amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspac ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; -; GCN64-LABEL: add_i32_constant: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN64-NEXT: s_mov_b64 s[6:7], exec -; GCN64-NEXT: ; implicit-def: $vgpr1 -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN64-NEXT: s_cbranch_execz BB0_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: s_mul_i32 s6, s6, 5 -; GCN64-NEXT: s_mov_b32 s10, -1 -; GCN64-NEXT: v_mov_b32_e32 v1, s6 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mov_b32 s8, s2 -; GCN64-NEXT: s_mov_b32 s9, s3 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB0_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_readfirstlane_b32 s2, v1 -; GCN64-NEXT: s_mov_b32 s3, 0x31016000 -; GCN64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GCN64-NEXT: s_mov_b32 s2, -1 -; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: add_i32_constant: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz BB0_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB0_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: add_i32_constant: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN32-NEXT: s_mov_b32 s5, exec_lo -; GCN32-NEXT: ; implicit-def: $vgpr1 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB0_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 -; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: s_mul_i32 s5, s5, 5 -; GCN32-NEXT: s_mov_b32 s10, -1 -; GCN32-NEXT: v_mov_b32_e32 v1, s5 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mov_b32 s8, s2 -; GCN32-NEXT: s_mov_b32 
s9, s3 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB0_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_readfirstlane_b32 s2, v1 -; GCN32-NEXT: s_mov_b32 s3, 0x31016000 -; GCN32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 -; GCN32-NEXT: s_mov_b32 s2, -1 -; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: add_i32_constant: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB0_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB0_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm entry: %old = atomicrmw add i32 addrspace(1)* %inout, i32 5 acq_rel store i32 %old, i32 addrspace(1)* %out @@ -258,82 +258,82 @@ define amdgpu_kernel void @add_i32_uniform(i32 addrspace(1)* %out, i32 addrspace ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; -; GCN64-LABEL: add_i32_uniform: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: s_clause 0x1 -; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN64-NEXT: s_load_dword s2, s[0:1], 0x34 -; GCN64-NEXT: s_mov_b64 s[8:9], exec -; GCN64-NEXT: ; implicit-def: $vgpr1 -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN64-NEXT: s_cbranch_execz BB1_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mul_i32 s3, s2, s3 -; GCN64-NEXT: s_mov_b32 s10, -1 -; GCN64-NEXT: v_mov_b32_e32 v1, s3 -; GCN64-NEXT: s_mov_b32 s8, s6 -; GCN64-NEXT: s_mov_b32 s9, s7 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB1_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN64-NEXT: v_readfirstlane_b32 s0, v1 -; GCN64-NEXT: s_mov_b32 s7, 0x31016000 -; 
GCN64-NEXT: s_mov_b32 s6, -1 -; GCN64-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GCN64-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: add_i32_uniform: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz BB1_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: s_mov_b32 s8, s6 +; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB1_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: add_i32_uniform: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: s_clause 0x1 -; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN32-NEXT: s_load_dword s2, s[0:1], 0x34 -; GCN32-NEXT: s_mov_b32 s3, exec_lo -; GCN32-NEXT: ; implicit-def: $vgpr1 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB1_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: s_bcnt1_i32_b32 s1, s3 -; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mul_i32 s1, s2, s1 -; GCN32-NEXT: s_mov_b32 s10, -1 -; GCN32-NEXT: v_mov_b32_e32 v1, s1 -; GCN32-NEXT: s_mov_b32 s8, s6 -; GCN32-NEXT: s_mov_b32 s9, s7 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB1_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN32-NEXT: v_readfirstlane_b32 s0, v1 -; GCN32-NEXT: s_mov_b32 s7, 0x31016000 -; GCN32-NEXT: s_mov_b32 s6, -1 -; GCN32-NEXT: v_add_nc_u32_e32 v0, s0, v0 -; GCN32-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: add_i32_uniform: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB1_2 +; GFX1032-NEXT: ; %bb.1: 
+; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: s_mov_b32 s8, s6 +; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB1_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: s_endpgm entry: %old = atomicrmw add i32 addrspace(1)* %inout, i32 %additive acq_rel store i32 %old, i32 addrspace(1)* %out @@ -468,127 +468,127 @@ define amdgpu_kernel void @add_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GCN64-LABEL: add_i32_varying: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: v_mov_b32_e32 v1, v0 -; GCN64-NEXT: s_not_b64 exec, exec -; GCN64-NEXT: v_mov_b32_e32 v1, 0 -; GCN64-NEXT: s_not_b64 exec, exec -; GCN64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN64-NEXT: v_mov_b32_e32 v3, 0 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN64-NEXT: v_mov_b32_e32 v2, v1 -; GCN64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GCN64-NEXT: v_readlane_b32 s4, v1, 31 -; GCN64-NEXT: v_mov_b32_e32 v2, s4 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GCN64-NEXT: v_readlane_b32 s6, v1, 15 -; GCN64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GCN64-NEXT: s_mov_b64 exec, s[2:3] -; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN64-NEXT: v_readlane_b32 s7, v1, 31 -; GCN64-NEXT: v_writelane_b32 v3, s6, 16 -; GCN64-NEXT: s_mov_b64 exec, s[4:5] -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN64-NEXT: v_readlane_b32 s8, v1, 47 -; GCN64-NEXT: v_readlane_b32 s9, v1, 63 -; GCN64-NEXT: v_writelane_b32 v3, s7, 32 -; GCN64-NEXT: s_mov_b64 exec, s[4:5] -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 -; GCN64-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN64-NEXT: s_mov_b32 s4, s9 -; GCN64-NEXT: v_writelane_b32 v3, s8, 48 -; GCN64-NEXT: s_mov_b64 exec, s[6:7] -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_mov_b32 s6, -1 -; GCN64-NEXT: ; implicit-def: $vgpr0 -; GCN64-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN64-NEXT: s_cbranch_execz BB2_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: v_mov_b32_e32 v0, s4 -; GCN64-NEXT: s_mov_b32 s7, 0x31016000 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mov_b32 s4, s2 -; GCN64-NEXT: s_mov_b32 s5, s3 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
-; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB2_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_readfirstlane_b32 s2, v0 -; GCN64-NEXT: v_mov_b32_e32 v0, v3 -; GCN64-NEXT: s_mov_b32 s3, 0x31016000 -; GCN64-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GCN64-NEXT: s_mov_b32 s2, s6 -; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: add_i32_varying: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064-NEXT: s_mov_b32 s4, s9 +; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064-NEXT: s_cbranch_execz BB2_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s4, s2 +; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB2_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: 
buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: add_i32_varying: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: v_mov_b32_e32 v1, v0 -; GCN32-NEXT: s_not_b32 exec_lo, exec_lo -; GCN32-NEXT: v_mov_b32_e32 v1, 0 -; GCN32-NEXT: s_not_b32 exec_lo, exec_lo -; GCN32-NEXT: s_or_saveexec_b32 s2, -1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN32-NEXT: v_mov_b32_e32 v2, v1 -; GCN32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GCN32-NEXT: s_mov_b32 exec_lo, s2 -; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN32-NEXT: s_or_saveexec_b32 s4, -1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GCN32-NEXT: v_mov_b32_e32 v3, 0 -; GCN32-NEXT: v_readlane_b32 s5, v1, 15 -; GCN32-NEXT: v_readlane_b32 s6, v1, 31 -; GCN32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GCN32-NEXT: s_mov_b32 exec_lo, s4 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GCN32-NEXT: s_or_saveexec_b32 s4, -1 -; GCN32-NEXT: v_writelane_b32 v3, s5, 16 -; GCN32-NEXT: s_mov_b32 exec_lo, s4 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_mov_b32 s4, s6 -; GCN32-NEXT: s_mov_b32 s6, -1 -; GCN32-NEXT: ; implicit-def: $vgpr0 -; GCN32-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB2_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: v_mov_b32_e32 v0, s4 -; GCN32-NEXT: s_mov_b32 s7, 0x31016000 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mov_b32 s4, s2 -; GCN32-NEXT: s_mov_b32 s5, s3 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB2_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_readfirstlane_b32 s2, v0 -; GCN32-NEXT: v_mov_b32_e32 v0, v3 -; GCN32-NEXT: s_mov_b32 s3, 0x31016000 -; GCN32-NEXT: v_add_nc_u32_e32 v0, s2, v0 -; GCN32-NEXT: s_mov_b32 s2, s6 -; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: add_i32_varying: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s6, 
v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s4, s6 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB2_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s4, s2 +; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB2_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_add_nc_u32_e32 v0, s2, v0 +; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw add i32 addrspace(1)* %inout, i32 %lane acq_rel @@ -675,80 +675,80 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GFX89-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; -; GCN64-LABEL: add_i64_constant: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN64-NEXT: s_mov_b64 s[6:7], exec -; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN64-NEXT: s_cbranch_execz BB3_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN64-NEXT: v_mov_b32_e32 v2, 0 -; GCN64-NEXT: s_mul_i32 s6, s6, 5 -; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: v_mov_b32_e32 v1, s6 -; GCN64-NEXT: s_mov_b32 s10, -1 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mov_b32 s8, s2 -; GCN64-NEXT: s_mov_b32 s9, s3 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB3_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_readfirstlane_b32 s2, v1 -; GCN64-NEXT: v_readfirstlane_b32 s3, v2 -; GCN64-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] -; GCN64-NEXT: s_mov_b32 s3, 0x31016000 -; GCN64-NEXT: s_mov_b32 s2, -1 -; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: add_i64_constant: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: 
s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz BB3_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB3_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: add_i64_constant: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN32-NEXT: s_mov_b32 s5, exec_lo -; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB3_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 -; GCN32-NEXT: v_mov_b32_e32 v2, 0 -; GCN32-NEXT: s_mul_i32 s5, s5, 5 -; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: v_mov_b32_e32 v1, s5 -; GCN32-NEXT: s_mov_b32 s10, -1 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mov_b32 s8, s2 -; GCN32-NEXT: s_mov_b32 s9, s3 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB3_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_readfirstlane_b32 s2, v1 -; GCN32-NEXT: v_readfirstlane_b32 s3, v2 -; GCN32-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] -; GCN32-NEXT: s_mov_b32 s3, 0x31016000 -; GCN32-NEXT: s_mov_b32 s2, -1 -; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: add_i64_constant: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB3_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; 
GFX1032-NEXT: BB3_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm entry: %old = atomicrmw add i64 addrspace(1)* %inout, i64 5 acq_rel store i64 %old, i64 addrspace(1)* %out @@ -892,100 +892,100 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; -; GCN64-LABEL: add_i64_uniform: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: s_clause 0x1 -; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN64-NEXT: s_mov_b64 s[8:9], exec -; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN64-NEXT: s_cbranch_execz BB4_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mul_i32 s9, s3, s8 -; GCN64-NEXT: s_mul_hi_u32 s10, s2, s8 -; GCN64-NEXT: s_mul_i32 s8, s2, s8 -; GCN64-NEXT: s_add_i32 s10, s10, s9 -; GCN64-NEXT: v_mov_b32_e32 v1, s8 -; GCN64-NEXT: v_mov_b32_e32 v2, s10 -; GCN64-NEXT: s_mov_b32 s10, -1 -; GCN64-NEXT: s_mov_b32 s8, s6 -; GCN64-NEXT: s_mov_b32 s9, s7 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB4_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_mul_lo_u32 v3, s3, v0 -; GCN64-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN64-NEXT: v_readfirstlane_b32 s0, v1 -; GCN64-NEXT: v_readfirstlane_b32 s1, v2 -; GCN64-NEXT: s_mov_b32 s7, 0x31016000 -; GCN64-NEXT: s_mov_b32 s6, -1 -; GCN64-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GCN64-NEXT: v_add_co_u32_e64 v0, vcc, s0, v0 -; GCN64-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc -; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: add_i64_uniform: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz BB4_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mul_i32 s9, s3, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX1064-NEXT: s_mul_i32 s8, s2, s8 +; GFX1064-NEXT: s_add_i32 s10, s10, s9 +; GFX1064-NEXT: v_mov_b32_e32 v1, s8 +; GFX1064-NEXT: v_mov_b32_e32 v2, s10 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: 
s_mov_b32 s8, s6 +; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB4_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 +; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 +; GFX1064-NEXT: v_add_co_u32_e64 v0, vcc, s0, v0 +; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: add_i64_uniform: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: s_clause 0x1 -; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN32-NEXT: s_mov_b32 s8, exec_lo -; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB4_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: s_bcnt1_i32_b32 s1, s8 -; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mul_i32 s8, s3, s1 -; GCN32-NEXT: s_mul_hi_u32 s9, s2, s1 -; GCN32-NEXT: s_mul_i32 s1, s2, s1 -; GCN32-NEXT: s_add_i32 s9, s9, s8 -; GCN32-NEXT: v_mov_b32_e32 v1, s1 -; GCN32-NEXT: v_mov_b32_e32 v2, s9 -; GCN32-NEXT: s_mov_b32 s10, -1 -; GCN32-NEXT: s_mov_b32 s8, s6 -; GCN32-NEXT: s_mov_b32 s9, s7 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB4_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_mul_lo_u32 v3, s3, v0 -; GCN32-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN32-NEXT: v_readfirstlane_b32 s0, v1 -; GCN32-NEXT: v_readfirstlane_b32 s1, v2 -; GCN32-NEXT: s_mov_b32 s7, 0x31016000 -; GCN32-NEXT: s_mov_b32 s6, -1 -; GCN32-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GCN32-NEXT: v_add_co_u32_e64 v0, vcc_lo, s0, v0 -; GCN32-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: add_i64_uniform: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB4_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mul_i32 s8, s3, s1 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_add_i32 s9, s9, 
s8 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s9 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s8, s6 +; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_add_x2 v[1:2], off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB4_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 +; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 +; GFX1032-NEXT: v_add_co_u32_e64 v0, vcc_lo, s0, v0 +; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032-NEXT: s_endpgm entry: %old = atomicrmw add i64 addrspace(1)* %inout, i64 %additive acq_rel store i64 %old, i64 addrspace(1)* %out @@ -1165,78 +1165,78 @@ define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspac ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GCN64-LABEL: sub_i32_constant: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN64-NEXT: s_mov_b64 s[6:7], exec -; GCN64-NEXT: ; implicit-def: $vgpr1 -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN64-NEXT: s_cbranch_execz BB6_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: s_mul_i32 s6, s6, 5 -; GCN64-NEXT: s_mov_b32 s10, -1 -; GCN64-NEXT: v_mov_b32_e32 v1, s6 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mov_b32 s8, s2 -; GCN64-NEXT: s_mov_b32 s9, s3 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB6_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_readfirstlane_b32 s2, v1 -; GCN64-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GCN64-NEXT: s_mov_b32 s3, 0x31016000 -; GCN64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GCN64-NEXT: s_mov_b32 s2, -1 -; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: sub_i32_constant: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz BB6_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, 
s3 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB6_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: sub_i32_constant: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN32-NEXT: s_mov_b32 s5, exec_lo -; GCN32-NEXT: ; implicit-def: $vgpr1 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB6_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 -; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: s_mul_i32 s5, s5, 5 -; GCN32-NEXT: s_mov_b32 s10, -1 -; GCN32-NEXT: v_mov_b32_e32 v1, s5 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mov_b32 s8, s2 -; GCN32-NEXT: s_mov_b32 s9, s3 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB6_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_readfirstlane_b32 s2, v1 -; GCN32-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GCN32-NEXT: s_mov_b32 s3, 0x31016000 -; GCN32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GCN32-NEXT: s_mov_b32 s2, -1 -; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: sub_i32_constant: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB6_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB6_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm entry: %old = atomicrmw sub i32 addrspace(1)* %inout, i32 5 acq_rel store i32 %old, i32 addrspace(1)* %out @@ -1349,82 +1349,82 @@ define 
amdgpu_kernel void @sub_i32_uniform(i32 addrspace(1)* %out, i32 addrspace ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; -; GCN64-LABEL: sub_i32_uniform: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: s_clause 0x1 -; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN64-NEXT: s_load_dword s2, s[0:1], 0x34 -; GCN64-NEXT: s_mov_b64 s[8:9], exec -; GCN64-NEXT: ; implicit-def: $vgpr1 -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN64-NEXT: s_cbranch_execz BB7_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: s_bcnt1_i32_b64 s3, s[8:9] -; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mul_i32 s3, s2, s3 -; GCN64-NEXT: s_mov_b32 s10, -1 -; GCN64-NEXT: v_mov_b32_e32 v1, s3 -; GCN64-NEXT: s_mov_b32 s8, s6 -; GCN64-NEXT: s_mov_b32 s9, s7 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB7_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN64-NEXT: v_readfirstlane_b32 s0, v1 -; GCN64-NEXT: s_mov_b32 s7, 0x31016000 -; GCN64-NEXT: s_mov_b32 s6, -1 -; GCN64-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GCN64-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: sub_i32_uniform: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: ; implicit-def: $vgpr1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz BB7_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[8:9] +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mul_i32 s3, s2, s3 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: v_mov_b32_e32 v1, s3 +; GFX1064-NEXT: s_mov_b32 s8, s6 +; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB7_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1064-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: sub_i32_uniform: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: s_clause 0x1 -; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN32-NEXT: s_load_dword s2, s[0:1], 0x34 -; GCN32-NEXT: s_mov_b32 s3, exec_lo -; GCN32-NEXT: ; implicit-def: $vgpr1 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB7_2 
-; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: s_bcnt1_i32_b32 s1, s3 -; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mul_i32 s1, s2, s1 -; GCN32-NEXT: s_mov_b32 s10, -1 -; GCN32-NEXT: v_mov_b32_e32 v1, s1 -; GCN32-NEXT: s_mov_b32 s8, s6 -; GCN32-NEXT: s_mov_b32 s9, s7 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB7_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN32-NEXT: v_readfirstlane_b32 s0, v1 -; GCN32-NEXT: s_mov_b32 s7, 0x31016000 -; GCN32-NEXT: s_mov_b32 s6, -1 -; GCN32-NEXT: v_sub_nc_u32_e32 v0, s0, v0 -; GCN32-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: sub_i32_uniform: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB7_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s3 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: s_mov_b32 s8, s6 +; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB7_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s0, v0 +; GFX1032-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX1032-NEXT: s_endpgm entry: %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %subitive acq_rel store i32 %old, i32 addrspace(1)* %out @@ -1559,127 +1559,127 @@ define amdgpu_kernel void @sub_i32_varying(i32 addrspace(1)* %out, i32 addrspace ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GCN64-LABEL: sub_i32_varying: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: v_mov_b32_e32 v1, v0 -; GCN64-NEXT: s_not_b64 exec, exec -; GCN64-NEXT: v_mov_b32_e32 v1, 0 -; GCN64-NEXT: s_not_b64 exec, exec -; GCN64-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN64-NEXT: v_mov_b32_e32 v3, 0 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN64-NEXT: v_mov_b32_e32 v2, v1 -; GCN64-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa 
bank_mask:0xf -; GCN64-NEXT: v_readlane_b32 s4, v1, 31 -; GCN64-NEXT: v_mov_b32_e32 v2, s4 -; GCN64-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf -; GCN64-NEXT: v_readlane_b32 s6, v1, 15 -; GCN64-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GCN64-NEXT: s_mov_b64 exec, s[2:3] -; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN64-NEXT: v_readlane_b32 s7, v1, 31 -; GCN64-NEXT: v_writelane_b32 v3, s6, 16 -; GCN64-NEXT: s_mov_b64 exec, s[4:5] -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GCN64-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN64-NEXT: v_readlane_b32 s8, v1, 47 -; GCN64-NEXT: v_readlane_b32 s9, v1, 63 -; GCN64-NEXT: v_writelane_b32 v3, s7, 32 -; GCN64-NEXT: s_mov_b64 exec, s[4:5] -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 -; GCN64-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN64-NEXT: s_mov_b32 s4, s9 -; GCN64-NEXT: v_writelane_b32 v3, s8, 48 -; GCN64-NEXT: s_mov_b64 exec, s[6:7] -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_mov_b32 s6, -1 -; GCN64-NEXT: ; implicit-def: $vgpr0 -; GCN64-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN64-NEXT: s_cbranch_execz BB8_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: v_mov_b32_e32 v0, s4 -; GCN64-NEXT: s_mov_b32 s7, 0x31016000 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mov_b32 s4, s2 -; GCN64-NEXT: s_mov_b32 s5, s3 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB8_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_readfirstlane_b32 s2, v0 -; GCN64-NEXT: v_mov_b32_e32 v0, v3 -; GCN64-NEXT: s_mov_b32 s3, 0x31016000 -; GCN64-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GCN64-NEXT: s_mov_b32 s2, s6 -; GCN64-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: sub_i32_varying: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-NEXT: s_not_b64 exec, exec +; GFX1064-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s4, v1, 31 +; GFX1064-NEXT: v_mov_b32_e32 v2, s4 +; GFX1064-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xc bank_mask:0xf +; GFX1064-NEXT: v_readlane_b32 s6, v1, 15 +; GFX1064-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1064-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_readlane_b32 s7, v1, 31 +; GFX1064-NEXT: v_writelane_b32 v3, s6, 16 +; GFX1064-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1064-NEXT: 
s_or_saveexec_b64 s[4:5], -1 +; GFX1064-NEXT: v_readlane_b32 s8, v1, 47 +; GFX1064-NEXT: v_readlane_b32 s9, v1, 63 +; GFX1064-NEXT: v_writelane_b32 v3, s7, 32 +; GFX1064-NEXT: s_mov_b64 exec, s[4:5] +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 +; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GFX1064-NEXT: s_mov_b32 s4, s9 +; GFX1064-NEXT: v_writelane_b32 v3, s8, 48 +; GFX1064-NEXT: s_mov_b64 exec, s[6:7] +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: ; implicit-def: $vgpr0 +; GFX1064-NEXT: s_and_saveexec_b64 s[8:9], vcc +; GFX1064-NEXT: s_cbranch_execz BB8_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s4, s2 +; GFX1064-NEXT: s_mov_b32 s5, s3 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB8_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1064-NEXT: v_mov_b32_e32 v0, v3 +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1064-NEXT: s_mov_b32 s2, s6 +; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: sub_i32_varying: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: v_mov_b32_e32 v1, v0 -; GCN32-NEXT: s_not_b32 exec_lo, exec_lo -; GCN32-NEXT: v_mov_b32_e32 v1, 0 -; GCN32-NEXT: s_not_b32 exec_lo, exec_lo -; GCN32-NEXT: s_or_saveexec_b32 s2, -1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GCN32-NEXT: v_mov_b32_e32 v2, v1 -; GCN32-NEXT: v_permlanex16_b32 v2, v2, -1, -1 -; GCN32-NEXT: s_mov_b32 exec_lo, s2 -; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN32-NEXT: s_or_saveexec_b32 s4, -1 -; GCN32-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf -; GCN32-NEXT: v_mov_b32_e32 v3, 0 -; GCN32-NEXT: v_readlane_b32 s5, v1, 15 -; GCN32-NEXT: v_readlane_b32 s6, v1, 31 -; GCN32-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf -; GCN32-NEXT: s_mov_b32 exec_lo, s4 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GCN32-NEXT: s_or_saveexec_b32 s4, -1 -; GCN32-NEXT: v_writelane_b32 v3, s5, 16 -; GCN32-NEXT: s_mov_b32 exec_lo, s4 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_mov_b32 s4, s6 -; GCN32-NEXT: s_mov_b32 s6, -1 -; GCN32-NEXT: ; implicit-def: $vgpr0 -; GCN32-NEXT: s_and_saveexec_b32 s8, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB8_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: v_mov_b32_e32 v0, s4 -; GCN32-NEXT: s_mov_b32 s7, 0x31016000 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mov_b32 s4, s2 -; GCN32-NEXT: s_mov_b32 s5, s3 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB8_2: -; GCN32-NEXT: 
s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s8 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_readfirstlane_b32 s2, v0 -; GCN32-NEXT: v_mov_b32_e32 v0, v3 -; GCN32-NEXT: s_mov_b32 s3, 0x31016000 -; GCN32-NEXT: v_sub_nc_u32_e32 v0, s2, v0 -; GCN32-NEXT: s_mov_b32 s2, s6 -; GCN32-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: sub_i32_varying: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032-NEXT: v_add_nc_u32_dpp v1, v2, v1 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xf +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_readlane_b32 s5, v1, 15 +; GFX1032-NEXT: v_readlane_b32 s6, v1, 31 +; GFX1032-NEXT: v_mov_b32_dpp v3, v1 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX1032-NEXT: s_or_saveexec_b32 s4, -1 +; GFX1032-NEXT: v_writelane_b32 v3, s5, 16 +; GFX1032-NEXT: s_mov_b32 exec_lo, s4 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_mov_b32 s4, s6 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: ; implicit-def: $vgpr0 +; GFX1032-NEXT: s_and_saveexec_b32 s8, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB8_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s4, s2 +; GFX1032-NEXT: s_mov_b32 s5, s3 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB8_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 +; GFX1032-NEXT: v_mov_b32_e32 v0, v3 +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 +; GFX1032-NEXT: s_mov_b32 s2, s6 +; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %old = atomicrmw sub i32 addrspace(1)* %inout, i32 %lane acq_rel @@ -1806,86 +1806,86 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspac ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GCN64-LABEL: sub_i64_constant: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN64-NEXT: s_mov_b64 s[6:7], exec -; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN64-NEXT: s_cbranch_execz BB9_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN64-NEXT: v_mov_b32_e32 v2, 0 -; GCN64-NEXT: s_mul_i32 s6, s6, 5 -; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: v_mov_b32_e32 v1, s6 -; GCN64-NEXT: s_mov_b32 s10, -1 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mov_b32 s8, s2 -; GCN64-NEXT: s_mov_b32 s9, s3 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB9_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_readfirstlane_b32 s2, v1 -; GCN64-NEXT: v_mul_u32_u24_e32 v1, 5, v0 -; GCN64-NEXT: v_readfirstlane_b32 s3, v2 -; GCN64-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 -; GCN64-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 -; GCN64-NEXT: s_mov_b32 s2, -1 -; GCN64-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc -; GCN64-NEXT: s_mov_b32 s3, 0x31016000 -; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: sub_i64_constant: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s7, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz BB9_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mov_b32 s8, s2 +; GFX1064-NEXT: s_mov_b32 s9, s3 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB9_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1064-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 +; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s2, v1 +; GFX1064-NEXT: s_mov_b32 s2, -1 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v2, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: sub_i64_constant: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GCN32-NEXT: s_mov_b32 s5, exec_lo -; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB9_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: s_bcnt1_i32_b32 s5, s5 -; GCN32-NEXT: v_mov_b32_e32 v2, 0 -; GCN32-NEXT: s_mul_i32 s5, s5, 5 -; GCN32-NEXT: s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: v_mov_b32_e32 v1, s5 -; GCN32-NEXT: s_mov_b32 s10, -1 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: 
s_mov_b32 s8, s2 -; GCN32-NEXT: s_mov_b32 s9, s3 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB9_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s4 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_readfirstlane_b32 s2, v1 -; GCN32-NEXT: v_mul_u32_u24_e32 v1, 5, v0 -; GCN32-NEXT: v_readfirstlane_b32 s3, v2 -; GCN32-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 -; GCN32-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 -; GCN32-NEXT: s_mov_b32 s2, -1 -; GCN32-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo -; GCN32-NEXT: s_mov_b32 s3, 0x31016000 -; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: sub_i64_constant: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s5, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB9_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mov_b32 s8, s2 +; GFX1032-NEXT: s_mov_b32 s9, s3 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB9_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 +; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 +; GFX1032-NEXT: v_mul_hi_u32_u24_e32 v2, 5, v0 +; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s2, v1 +; GFX1032-NEXT: s_mov_b32 s2, -1 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v2, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX1032-NEXT: s_endpgm entry: %old = atomicrmw sub i64 addrspace(1)* %inout, i64 5 acq_rel store i64 %old, i64 addrspace(1)* %out @@ -2029,100 +2029,100 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; -; GCN64-LABEL: sub_i64_uniform: -; GCN64: ; %bb.0: ; %entry -; GCN64-NEXT: s_clause 0x1 -; GCN64-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN64-NEXT: s_mov_b64 s[8:9], exec -; GCN64-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN64-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GCN64-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 -; GCN64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GCN64-NEXT: s_and_saveexec_b64 s[0:1], vcc -; GCN64-NEXT: s_cbranch_execz BB10_2 -; GCN64-NEXT: ; %bb.1: -; GCN64-NEXT: s_bcnt1_i32_b64 s8, s[8:9] -; GCN64-NEXT: s_mov_b32 s11, 0x31016000 -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: s_mul_i32 s9, s3, s8 -; GCN64-NEXT: s_mul_hi_u32 s10, s2, s8 -; GCN64-NEXT: s_mul_i32 s8, s2, s8 -; GCN64-NEXT: 
s_add_i32 s10, s10, s9 -; GCN64-NEXT: v_mov_b32_e32 v1, s8 -; GCN64-NEXT: v_mov_b32_e32 v2, s10 -; GCN64-NEXT: s_mov_b32 s10, -1 -; GCN64-NEXT: s_mov_b32 s8, s6 -; GCN64-NEXT: s_mov_b32 s9, s7 -; GCN64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN64-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN64-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc -; GCN64-NEXT: s_waitcnt vmcnt(0) -; GCN64-NEXT: buffer_gl0_inv -; GCN64-NEXT: buffer_gl1_inv -; GCN64-NEXT: BB10_2: -; GCN64-NEXT: s_waitcnt_depctr 0xffe3 -; GCN64-NEXT: s_or_b64 exec, exec, s[0:1] -; GCN64-NEXT: s_waitcnt lgkmcnt(0) -; GCN64-NEXT: v_mul_lo_u32 v3, s3, v0 -; GCN64-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN64-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN64-NEXT: v_readfirstlane_b32 s0, v1 -; GCN64-NEXT: v_readfirstlane_b32 s1, v2 -; GCN64-NEXT: s_mov_b32 s7, 0x31016000 -; GCN64-NEXT: s_mov_b32 s6, -1 -; GCN64-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GCN64-NEXT: v_sub_co_u32_e64 v0, vcc, s0, v0 -; GCN64-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc -; GCN64-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GCN64-NEXT: s_endpgm +; GFX1064-LABEL: sub_i64_uniform: +; GFX1064: ; %bb.0: ; %entry +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, s9, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc +; GFX1064-NEXT: s_cbranch_execz BB10_2 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_bcnt1_i32_b64 s8, s[8:9] +; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_mul_i32 s9, s3, s8 +; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX1064-NEXT: s_mul_i32 s8, s2, s8 +; GFX1064-NEXT: s_add_i32 s10, s10, s9 +; GFX1064-NEXT: v_mov_b32_e32 v1, s8 +; GFX1064-NEXT: v_mov_b32_e32 v2, s10 +; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: s_mov_b32 s8, s6 +; GFX1064-NEXT: s_mov_b32 s9, s7 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: BB10_2: +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 +; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX1064-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1064-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1064-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1064-NEXT: s_mov_b32 s6, -1 +; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3 +; GFX1064-NEXT: v_sub_co_u32_e64 v0, vcc, s0, v0 +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc +; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1064-NEXT: s_endpgm ; -; GCN32-LABEL: sub_i64_uniform: -; GCN32: ; %bb.0: ; %entry -; GCN32-NEXT: s_clause 0x1 -; GCN32-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 -; GCN32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN32-NEXT: s_mov_b32 s8, exec_lo -; GCN32-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GCN32-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GCN32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GCN32-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN32-NEXT: s_cbranch_execz BB10_2 -; GCN32-NEXT: ; %bb.1: -; GCN32-NEXT: s_bcnt1_i32_b32 s1, s8 -; GCN32-NEXT: 
s_mov_b32 s11, 0x31016000 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: s_mul_i32 s8, s3, s1 -; GCN32-NEXT: s_mul_hi_u32 s9, s2, s1 -; GCN32-NEXT: s_mul_i32 s1, s2, s1 -; GCN32-NEXT: s_add_i32 s9, s9, s8 -; GCN32-NEXT: v_mov_b32_e32 v1, s1 -; GCN32-NEXT: v_mov_b32_e32 v2, s9 -; GCN32-NEXT: s_mov_b32 s10, -1 -; GCN32-NEXT: s_mov_b32 s8, s6 -; GCN32-NEXT: s_mov_b32 s9, s7 -; GCN32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN32-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN32-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc -; GCN32-NEXT: s_waitcnt vmcnt(0) -; GCN32-NEXT: buffer_gl0_inv -; GCN32-NEXT: buffer_gl1_inv -; GCN32-NEXT: BB10_2: -; GCN32-NEXT: s_waitcnt_depctr 0xffe3 -; GCN32-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GCN32-NEXT: s_waitcnt lgkmcnt(0) -; GCN32-NEXT: v_mul_lo_u32 v3, s3, v0 -; GCN32-NEXT: v_mul_hi_u32 v4, s2, v0 -; GCN32-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN32-NEXT: v_readfirstlane_b32 s0, v1 -; GCN32-NEXT: v_readfirstlane_b32 s1, v2 -; GCN32-NEXT: s_mov_b32 s7, 0x31016000 -; GCN32-NEXT: s_mov_b32 s6, -1 -; GCN32-NEXT: v_add_nc_u32_e32 v1, v4, v3 -; GCN32-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s0, v0 -; GCN32-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo -; GCN32-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GCN32-NEXT: s_endpgm +; GFX1032-LABEL: sub_i64_uniform: +; GFX1032: ; %bb.0: ; %entry +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX1032-NEXT: s_mov_b32 s8, exec_lo +; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo +; GFX1032-NEXT: s_cbranch_execz BB10_2 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_bcnt1_i32_b32 s1, s8 +; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_mul_i32 s8, s3, s1 +; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 +; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_add_i32 s9, s9, s8 +; GFX1032-NEXT: v_mov_b32_e32 v1, s1 +; GFX1032-NEXT: v_mov_b32_e32 v2, s9 +; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: s_mov_b32 s8, s6 +; GFX1032-NEXT: s_mov_b32 s9, s7 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[8:11], 0 glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: BB10_2: +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 +; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v0 +; GFX1032-NEXT: v_mul_lo_u32 v0, s2, v0 +; GFX1032-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1032-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1032-NEXT: s_mov_b32 s7, 0x31016000 +; GFX1032-NEXT: s_mov_b32 s6, -1 +; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3 +; GFX1032-NEXT: v_sub_co_u32_e64 v0, vcc_lo, s0, v0 +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo +; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX1032-NEXT: s_endpgm entry: %old = atomicrmw sub i64 addrspace(1)* %inout, i64 %subitive acq_rel store i64 %old, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index bb56be5f12a4..5590c4ee47bd 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ 
b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -2,8 +2,8 @@ ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX1032 %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -1169,35 +1169,20 @@ define amdgpu_kernel void @add_i64_varying(i64 addrspace(1)* %out) { ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: add_i64_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: add_i64_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX10-LABEL: add_i64_varying: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 @@ -2376,35 +2361,20 @@ define amdgpu_kernel 
void @sub_i64_varying(i64 addrspace(1)* %out) { ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX1064-LABEL: sub_i64_varying: -; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo -; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] -; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1064-NEXT: s_endpgm -; -; GFX1032-LABEL: sub_i64_varying: -; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo -; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] -; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; GFX1032-NEXT: s_endpgm +; GFX10-LABEL: sub_i64_varying: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10-NEXT: s_mov_b32 s3, 0x31016000 +; GFX10-NEXT: s_mov_b32 s2, -1 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GFX10-NEXT: s_endpgm entry: %lane = call i32 @llvm.amdgcn.workitem.id.x() %zext = zext i32 %lane to i64 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index f303aad8389a..ddbf168e7734 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-- -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX8 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 
-mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizations=true -verify-machineinstrs -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s @@ -46,73 +46,39 @@ define amdgpu_ps void @add_i32_constant(<4 x i32> inreg %out, <4 x i32> inreg %i ; GFX7-NEXT: BB0_6: ; %UnifiedReturnBlock ; GFX7-NEXT: s_endpgm ; -; GFX8-LABEL: add_i32_constant: -; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: s_mov_b64 s[10:11], exec -; GFX8-NEXT: ; implicit-def: $vgpr0 -; GFX8-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX8-NEXT: s_cbranch_execz BB0_4 -; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_mov_b64 s[12:13], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX8-NEXT: s_cbranch_execz BB0_3 -; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_bcnt1_i32_b64 s12, s[12:13] -; GFX8-NEXT: s_mul_i32 s12, s12, 5 -; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX8-NEXT: BB0_3: -; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] -; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s4, v1 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX8-NEXT: BB0_4: ; %Flow -; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX8-NEXT: s_wqm_b64 s[4:5], -1 -; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX8-NEXT: s_cbranch_vccnz BB0_6 -; GFX8-NEXT: ; %bb.5: ; %if -; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX8-NEXT: BB0_6: ; %UnifiedReturnBlock -; GFX8-NEXT: s_endpgm -; -; GFX9-LABEL: add_i32_constant: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_mov_b64 s[10:11], exec -; GFX9-NEXT: ; implicit-def: $vgpr0 -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] -; GFX9-NEXT: s_cbranch_execz BB0_4 -; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_mov_b64 s[12:13], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc -; GFX9-NEXT: s_cbranch_execz BB0_3 -; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_bcnt1_i32_b64 s12, s[12:13] -; GFX9-NEXT: s_mul_i32 s12, s12, 5 -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc -; GFX9-NEXT: BB0_3: -; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 -; GFX9-NEXT: BB0_4: ; %Flow -; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] -; GFX9-NEXT: s_wqm_b64 s[4:5], -1 -; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] -; GFX9-NEXT: s_cbranch_vccnz BB0_6 -; GFX9-NEXT: ; %bb.5: ; %if -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 -; GFX9-NEXT: BB0_6: ; %UnifiedReturnBlock -; GFX9-NEXT: s_endpgm +; GFX89-LABEL: add_i32_constant: +; GFX89: ; %bb.0: ; %entry +; GFX89-NEXT: s_mov_b64 s[10:11], exec +; GFX89-NEXT: ; implicit-def: $vgpr0 +; GFX89-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] +; GFX89-NEXT: s_cbranch_execz BB0_4 +; GFX89-NEXT: ; %bb.1: +; GFX89-NEXT: s_mov_b64 s[12:13], exec +; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s12, 0 +; GFX89-NEXT: v_mbcnt_hi_u32_b32 v0, s13, v0 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX89-NEXT: ; 
implicit-def: $vgpr1 +; GFX89-NEXT: s_and_saveexec_b64 s[10:11], vcc +; GFX89-NEXT: s_cbranch_execz BB0_3 +; GFX89-NEXT: ; %bb.2: +; GFX89-NEXT: s_bcnt1_i32_b64 s12, s[12:13] +; GFX89-NEXT: s_mul_i32 s12, s12, 5 +; GFX89-NEXT: v_mov_b32_e32 v1, s12 +; GFX89-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc +; GFX89-NEXT: BB0_3: +; GFX89-NEXT: s_or_b64 exec, exec, s[10:11] +; GFX89-NEXT: s_waitcnt vmcnt(0) +; GFX89-NEXT: v_readfirstlane_b32 s4, v1 +; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX89-NEXT: BB0_4: ; %Flow +; GFX89-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX89-NEXT: s_wqm_b64 s[4:5], -1 +; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX89-NEXT: s_cbranch_vccnz BB0_6 +; GFX89-NEXT: ; %bb.5: ; %if +; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX89-NEXT: BB0_6: ; %UnifiedReturnBlock +; GFX89-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i32_constant: ; GFX1064: ; %bb.0: ; %entry -- GitLab From cfa65f77cbcd1185bdd3860ff326db37066a519a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Fri, 19 Mar 2021 13:47:43 +0200 Subject: [PATCH 0162/1000] [cmake] Enable Clang warnings about redundant semicolons This matches what GCC warns about when -pedantic is enabled. This should avoid such redundant semicolons creeping into the codebase. Differential Revision: https://reviews.llvm.org/D98941 --- llvm/cmake/modules/HandleLLVMOptions.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index d85fe137c191..c250a776517d 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -668,6 +668,11 @@ if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL)) if (LLVM_ENABLE_PEDANTIC AND LLVM_COMPILER_IS_GCC_COMPATIBLE) append("-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append("-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + + # GCC warns about redundant top-level semicolons (enabled by -pedantic + # above), while Clang doesn't. Enable the corresponding Clang option to + # pick up on these even in builds with Clang. + add_flag_if_supported("-Wc++98-compat-extra-semi" CXX98_COMPAT_EXTRA_SEMI_FLAG) endif() add_flag_if_supported("-Wimplicit-fallthrough" IMPLICIT_FALLTHROUGH_FLAG) -- GitLab From 7a154c32301de7241ea9ea7b05afad0bbdb76f9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 27 Feb 2021 14:46:16 +0200 Subject: [PATCH 0163/1000] [libcxx] [test] Account for differences in a trailing slash in weakly_canonical This seems to be a documented quirk in libc++'s implementation of weakly_canonical (in a comment in the weakly_canonical test). Together with a difference between Windows and POSIX regarding whether paths can go through nonexistent dirs, this results in a difference in a trailing slash. Just document this as expected, and degrade the comment from a FIXME to a note, as MS STL and libstdc++ behave in the same way.
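For illustration, a minimal sketch of the behaviour difference (this example is not part of the patch; it assumes a POSIX host running libc++, and the scratch path /tmp/wc_demo is a hypothetical choice for the example):

#include <filesystem>
#include <iostream>

int main() {
  namespace fs = std::filesystem;
  // Hypothetical scratch directory, created so that the leading part of
  // the path queried below actually exists.
  fs::create_directories("/tmp/wc_demo/dir");
  // "dne" does not exist, so on POSIX the path as a whole does not exist
  // and the lexically normalized tail keeps a trailing slash: this is
  // expected to print something like "/tmp/wc_demo/dir/". On Windows the
  // same lexical path is considered to exist, and the result comes back
  // without the trailing slash.
  std::cout << fs::weakly_canonical("/tmp/wc_demo/dir/dne/..") << '\n';
}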
Differential Revision: https://reviews.llvm.org/D98642 --- .../fs.op.funcs/fs.op.exists/exists.pass.cpp | 11 +++++++++++ .../fs.op.funcs/fs.op.relative/relative.pass.cpp | 10 ++++++++++ .../fs.op.weakly_canonical/weakly_canonical.pass.cpp | 10 +++++++++- 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp index a116d0886dd4..d198d136b21e 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.exists/exists.pass.cpp @@ -66,6 +66,17 @@ TEST_CASE(test_exist_not_found) const path p = static_env.DNE; TEST_CHECK(exists(p) == false); + TEST_CHECK(exists(static_env.Dir) == true); + TEST_CHECK(exists(static_env.Dir / "dne") == false); + // Whether /dne/.. is considered to exist or not is not necessarily + // something we need to define, but the platform specific behaviour + // does affect a few other tests, so clarify the root cause here. +#ifdef _WIN32 + TEST_CHECK(exists(static_env.Dir / "dne" / "..") == true); +#else + TEST_CHECK(exists(static_env.Dir / "dne" / "..") == false); +#endif + std::error_code ec = GetTestEC(); TEST_CHECK(exists(p, ec) == false); TEST_CHECK(!ec); diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp index aba9023bf8b4..0c056057927d 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.relative/relative.pass.cpp @@ -95,7 +95,17 @@ TEST_CASE(test_signature_9) { static_test_env static_env; fs::path p(static_env.SymlinkToDir / "dir2/../dir2/DNE/.."); const fs::path output = fs::weakly_canonical(p); + // weakly_canonical has a quirk - if the path is considered to exist, + // it's returned without a trailing slash, otherwise it's returned with + // one (see a note in fs.op.weakly_canonical/weakly_canonical.pass.cpp). + // On Windows, a path like existent/nonexistentsubdir/.. is considered + // to exist, on posix it's considered to not exist. Therefore, the + // result here differs in the trailing slash. +#ifdef _WIN32 + TEST_CHECK(output == fs::path::string_type(static_env.Dir2)); +#else TEST_CHECK(output == fs::path::string_type(static_env.Dir2 / "")); +#endif } TEST_CASE(test_signature_10) { diff --git a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp index 983ad7bf0137..b0909da01171 100644 --- a/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp +++ b/libcxx/test/std/input.output/filesystems/fs.op.funcs/fs.op.weakly_canonical/weakly_canonical.pass.cpp @@ -46,12 +46,20 @@ int main(int, char**) { {static_env.Dir, static_env.Dir}, {static_env.SymlinkToDir, static_env.Dir}, {static_env.SymlinkToDir / "dir2/.", static_env.Dir / "dir2"}, - // FIXME? If the trailing separator occurs in a part of the path that exists, + // Note: If the trailing separator occurs in a part of the path that exists, // it is omitted. Otherwise it is added to the end of the result. + // MS STL and libstdc++ behave similarly. 
{static_env.SymlinkToDir / "dir2/./", static_env.Dir / "dir2"}, {static_env.SymlinkToDir / "dir2/DNE/./", static_env.Dir / "dir2/DNE/"}, {static_env.SymlinkToDir / "dir2", static_env.Dir2}, +#ifdef _WIN32 + // On Windows, this path is considered to exist (even though it + // passes through a nonexistent directory), and thus is returned + // without a trailing slash, see the note above. + {static_env.SymlinkToDir / "dir2/../dir2/DNE/..", static_env.Dir2}, +#else {static_env.SymlinkToDir / "dir2/../dir2/DNE/..", static_env.Dir2 / ""}, +#endif {static_env.SymlinkToDir / "dir2/dir3/../DNE/DNE2", static_env.Dir2 / "DNE/DNE2"}, {static_env.Dir / "../dir1", static_env.Dir}, {static_env.Dir / "./.", static_env.Dir}, -- GitLab From 550292ecb19a203eeed90945a8402433882ee1d6 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Wed, 17 Mar 2021 12:30:11 +0000 Subject: [PATCH 0164/1000] [RISCV] Fix missing scalable->fixed-length vector conversion Returning the scalable-vector container type would present problems when the fixed-length INSERT_VECTOR_ELT was used by later operations. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D98776 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 8 +++-- .../CodeGen/RISCV/rvv/fixed-vectors-insert.ll | 36 +++++++++++++++++-- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 8c085425eb0a..6ddad93bc2dd 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -2379,8 +2379,12 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SDValue ValInVec; if (IsLegalInsert) { - if (isNullConstant(Idx)) - return DAG.getNode(RISCVISD::VMV_S_XF_VL, DL, ContainerVT, Vec, Val, VL); + if (isNullConstant(Idx)) { + Vec = DAG.getNode(RISCVISD::VMV_S_XF_VL, DL, ContainerVT, Vec, Val, VL); + if (!VecVT.isFixedLengthVector()) + return Vec; + return convertFromScalableVector(VecVT, Vec, DAG, Subtarget); + } ValInVec = DAG.getNode(RISCVISD::VMV_S_XF_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Val, VL); } else { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll index a9bdbd876cee..19b3ef6defff 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -178,7 +178,7 @@ define void @insertelt_v8i64_0(<8 x i64>* %x) { ; RV32-NEXT: vle64.v v28, (a0) ; RV32-NEXT: addi a1, zero, -1 ; RV32-NEXT: vmv.s.x v28, a1 -; RV32-NEXT: vs4r.v v28, (a0) +; RV32-NEXT: vse64.v v28, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_v8i64_0: @@ -235,7 +235,7 @@ define void @insertelt_c6_v8i64_0(<8 x i64>* %x) { ; RV32-NEXT: vle64.v v28, (a0) ; RV32-NEXT: addi a1, zero, 6 ; RV32-NEXT: vmv.s.x v28, a1 -; RV32-NEXT: vs4r.v v28, (a0) +; RV32-NEXT: vse64.v v28, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: insertelt_c6_v8i64_0: @@ -284,3 +284,35 @@ define void @insertelt_c6_v8i64(<8 x i64>* %x, i32 %idx) { store <8 x i64> %b, <8 x i64>* %x ret void } + +; Test that an insertelement at element 0 that is used by a later operation +; doesn't crash the compiler.
+define void @insertelt_c6_v8i64_0_add(<8 x i64>* %x, <8 x i64>* %y) { +; RV32-LABEL: insertelt_c6_v8i64_0_add: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a2, 8, e64,m4,ta,mu +; RV32-NEXT: vle64.v v28, (a0) +; RV32-NEXT: vle64.v v8, (a1) +; RV32-NEXT: addi a1, zero, 6 +; RV32-NEXT: vmv.s.x v28, a1 +; RV32-NEXT: vadd.vv v28, v28, v8 +; RV32-NEXT: vse64.v v28, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: insertelt_c6_v8i64_0_add: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a2, 8, e64,m4,ta,mu +; RV64-NEXT: vle64.v v28, (a0) +; RV64-NEXT: vle64.v v8, (a1) +; RV64-NEXT: addi a1, zero, 6 +; RV64-NEXT: vmv.s.x v28, a1 +; RV64-NEXT: vadd.vv v28, v28, v8 +; RV64-NEXT: vse64.v v28, (a0) +; RV64-NEXT: ret + %a = load <8 x i64>, <8 x i64>* %x + %b = insertelement <8 x i64> %a, i64 6, i32 0 + %c = load <8 x i64>, <8 x i64>* %y + %d = add <8 x i64> %b, %c + store <8 x i64> %d, <8 x i64>* %x + ret void +} -- GitLab From 3bffa2c2aad810637601f3276aa329a77c4dd241 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Fri, 19 Mar 2021 12:27:15 +0000 Subject: [PATCH 0165/1000] [RISCV] Add missing CHECKs to vector test Since the "LMUL-MAX=2" output for some test functions differed between RV32 and RV64, the update_llc_test_checks script failed to emit a unified LMULMAX2 check for them. I'm not sure why it didn't warn about this. This patch also takes the opportunity to add unified RV32/RV64 checks to help shorten the test file when the output for LMULMAX1 and LMULMAX2 is identical but differs between the two ISAs. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D98944 --- .../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 1517 +++++++++-------- 1 file changed, 791 insertions(+), 726 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 7eb49f1b8fe5..33f2e0d3998e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2 -; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX2 -; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1-RV32 -; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,LMULMAX1-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,LMULMAX2,LMULMAX2-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,LMULMAX2,LMULMAX2-RV64 +; RUN: llc -mtriple=riscv32 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 
-riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,LMULMAX1,LMULMAX1-RV32 +; RUN: llc -mtriple=riscv64 -mattr=+experimental-v -verify-machineinstrs -riscv-v-vector-bits-min=128 -riscv-v-fixed-length-vector-lmul-max=1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,LMULMAX1,LMULMAX1-RV64 define void @add_v16i8(<16 x i8>* %x, <16 x i8>* %y) { ; CHECK-LABEL: add_v16i8: @@ -943,58 +943,58 @@ define void @mulhu_v4i32(<4 x i32>* %x) { } define void @mulhu_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: mulhu_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI55_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI55_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI55_1) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI55_1) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhu_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, 1035469 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -819 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -819 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -819 -; LMULMAX1-RV64-NEXT: vmv.v.x v26, a1 -; LMULMAX1-RV64-NEXT: lui a1, 1026731 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmulhu.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vmv.v.i v26, 2 -; LMULMAX1-RV64-NEXT: addi a1, zero, 1 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhu_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI55_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI55_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vmulhu.vv v25, v25, v26 +; RV32-NEXT: lui a1, %hi(.LCPI55_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI55_1) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsrl.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhu_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: lui a1, 1035469 +; RV64-NEXT: addiw a1, a1, 
-819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -819 +; RV64-NEXT: vmv.v.x v26, a1 +; RV64-NEXT: lui a1, 1026731 +; RV64-NEXT: addiw a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1365 +; RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; RV64-NEXT: vmv.s.x v26, a1 +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vmulhu.vv v25, v25, v26 +; RV64-NEXT: vmv.v.i v26, 2 +; RV64-NEXT: addi a1, zero, 1 +; RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; RV64-NEXT: vmv.s.x v26, a1 +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vsrl.vv v25, v25, v26 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = udiv <2 x i64> %a, store <2 x i64> %b, <2 x i64>* %x @@ -1043,33 +1043,33 @@ define void @mulhs_v8i16(<8 x i16>* %x) { } define void @mulhs_v4i32(<4 x i32>* %x) { -; LMULMAX1-RV32-LABEL: mulhs_v4i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI58_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI58_0) -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vmulh.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 31 -; LMULMAX1-RV32-NEXT: vsra.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhs_v4i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI58_0) -; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI58_0) -; LMULMAX1-RV64-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vsra.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 31 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhs_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI58_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI58_0) +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vmulh.vv v25, v25, v26 +; RV32-NEXT: vsrl.vi v26, v25, 31 +; RV32-NEXT: vsra.vi v25, v25, 1 +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: vse32.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhs_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV64-NEXT: vle32.v v25, (a0) +; RV64-NEXT: lui a1, %hi(.LCPI58_0) +; RV64-NEXT: addi a1, a1, %lo(.LCPI58_0) +; RV64-NEXT: vle32.v v26, (a1) +; RV64-NEXT: vmulh.vv v25, v25, v26 +; RV64-NEXT: vsra.vi v25, v25, 1 +; RV64-NEXT: vsrl.vi v26, v25, 31 +; RV64-NEXT: vadd.vv v25, v25, v26 +; RV64-NEXT: vse32.v v25, (a0) +; RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = sdiv <4 x i32> %a, store <4 x i32> %b, <4 x i32>* %x @@ -1077,76 +1077,76 @@ define void @mulhs_v4i32(<4 x i32>* %x) { } define void @mulhs_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: mulhs_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI59_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI59_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; 
LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmul.vv v26, v25, v26 -; LMULMAX1-RV32-NEXT: lui a1, 349525 -; LMULMAX1-RV32-NEXT: addi a2, a1, 1365 -; LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.v.x v27, a2 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1366 -; LMULMAX1-RV32-NEXT: vsetvli a2, zero, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.s.x v27, a1 -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmulh.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI59_1) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI59_1) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsrl.vv v26, v25, v26 -; LMULMAX1-RV32-NEXT: addi a1, zero, 1 -; LMULMAX1-RV32-NEXT: vsetvli a2, zero, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.s.x v27, a1 -; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.v.i v28, 0 -; LMULMAX1-RV32-NEXT: vsetivli a1, 3, e32,m1,tu,mu -; LMULMAX1-RV32-NEXT: vslideup.vi v28, v27, 2 -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v28 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhs_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vmv.v.i v26, -1 -; LMULMAX1-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmv.s.x v26, zero -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmul.vv v26, v25, v26 -; LMULMAX1-RV64-NEXT: lui a1, 21845 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a2, a1, 1365 -; LMULMAX1-RV64-NEXT: vmv.v.x v27, a2 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1366 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmv.s.x v27, a1 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: addi a1, zero, 63 -; LMULMAX1-RV64-NEXT: vsrl.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vid.v v27 -; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhs_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI59_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI59_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vmul.vv v26, v25, v26 +; RV32-NEXT: lui a1, 349525 +; RV32-NEXT: addi a2, a1, 1365 +; RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; RV32-NEXT: vmv.v.x v27, a2 +; RV32-NEXT: addi a1, a1, 1366 +; RV32-NEXT: vsetvli a2, zero, e32,m1,ta,mu +; RV32-NEXT: vmv.s.x v27, a1 +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vmulh.vv v25, v25, v27 +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: lui a1, %hi(.LCPI59_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI59_1) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: 
vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsrl.vv v26, v25, v26 +; RV32-NEXT: addi a1, zero, 1 +; RV32-NEXT: vsetvli a2, zero, e32,m1,ta,mu +; RV32-NEXT: vmv.s.x v27, a1 +; RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV32-NEXT: vmv.v.i v28, 0 +; RV32-NEXT: vsetivli a1, 3, e32,m1,tu,mu +; RV32-NEXT: vslideup.vi v28, v27, 2 +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsra.vv v25, v25, v28 +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhs_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vmv.v.i v26, -1 +; RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu +; RV64-NEXT: vmv.s.x v26, zero +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vmul.vv v26, v25, v26 +; RV64-NEXT: lui a1, 21845 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a2, a1, 1365 +; RV64-NEXT: vmv.v.x v27, a2 +; RV64-NEXT: addi a1, a1, 1366 +; RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu +; RV64-NEXT: vmv.s.x v27, a1 +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vmulh.vv v25, v25, v27 +; RV64-NEXT: vadd.vv v25, v25, v26 +; RV64-NEXT: addi a1, zero, 63 +; RV64-NEXT: vsrl.vx v26, v25, a1 +; RV64-NEXT: vid.v v27 +; RV64-NEXT: vsra.vv v25, v25, v27 +; RV64-NEXT: vadd.vv v25, v25, v26 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = sdiv <2 x i64> %a, store <2 x i64> %b, <2 x i64>* %x @@ -3841,37 +3841,21 @@ define void @extract_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX2-NEXT: vse64.v v26, (a0) ; LMULMAX2-NEXT: ret ; -; LMULMAX1-RV32-LABEL: extract_v4i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a2, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: addi a2, a0, 16 -; LMULMAX1-RV32-NEXT: vle64.v v26, (a2) -; LMULMAX1-RV32-NEXT: vle64.v v27, (a1) -; LMULMAX1-RV32-NEXT: addi a1, a1, 16 -; LMULMAX1-RV32-NEXT: vle64.v v28, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v28 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v27 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: vse64.v v26, (a2) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: extract_v4i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi a2, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) -; LMULMAX1-RV64-NEXT: vle64.v v27, (a1) -; LMULMAX1-RV64-NEXT: addi a1, a1, 16 -; LMULMAX1-RV64-NEXT: vle64.v v28, (a1) -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v28 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX1-LABEL: extract_v4i64: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-NEXT: vle64.v v25, (a0) +; LMULMAX1-NEXT: addi a2, a0, 16 +; LMULMAX1-NEXT: vle64.v v26, (a2) +; LMULMAX1-NEXT: vle64.v v27, (a1) +; LMULMAX1-NEXT: addi a1, a1, 16 +; LMULMAX1-NEXT: vle64.v v28, (a1) +; LMULMAX1-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-NEXT: vadd.vv v26, v26, v28 +; LMULMAX1-NEXT: vadd.vv v25, v25, v27 +; LMULMAX1-NEXT: vse64.v v25, (a0) +; LMULMAX1-NEXT: vse64.v v26, (a2) +; LMULMAX1-NEXT: ret %a = load <4 x 
i64>, <4 x i64>* %x %b = load <4 x i64>, <4 x i64>* %y br label %"compute" @@ -3908,35 +3892,20 @@ define void @mulhu_v32i8(<32 x i8>* %x) { ; LMULMAX2-NEXT: vse8.v v26, (a0) ; LMULMAX2-NEXT: ret ; -; LMULMAX1-RV32-LABEL: mulhu_v32i8: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle8.v v25, (a1) -; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI129_0) -; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI129_0) -; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) -; LMULMAX1-RV32-NEXT: vle8.v v27, (a0) -; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vdivu.vv v26, v27, v26 -; LMULMAX1-RV32-NEXT: vse8.v v26, (a0) -; LMULMAX1-RV32-NEXT: vse8.v v25, (a1) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhu_v32i8: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle8.v v25, (a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI129_0) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI129_0) -; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) -; LMULMAX1-RV64-NEXT: vle8.v v27, (a0) -; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 -; LMULMAX1-RV64-NEXT: vse8.v v26, (a0) -; LMULMAX1-RV64-NEXT: vse8.v v25, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX1-LABEL: mulhu_v32i8: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 16, e8,m1,ta,mu +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle8.v v25, (a1) +; LMULMAX1-NEXT: lui a2, %hi(.LCPI129_0) +; LMULMAX1-NEXT: addi a2, a2, %lo(.LCPI129_0) +; LMULMAX1-NEXT: vle8.v v26, (a2) +; LMULMAX1-NEXT: vle8.v v27, (a0) +; LMULMAX1-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-NEXT: vse8.v v26, (a0) +; LMULMAX1-NEXT: vse8.v v25, (a1) +; LMULMAX1-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = udiv <32 x i8> %a, store <32 x i8> %b, <32 x i8>* %x @@ -3969,35 +3938,20 @@ define void @mulhu_v16i16(<16 x i16>* %x) { ; LMULMAX2-NEXT: vse16.v v26, (a0) ; LMULMAX2-NEXT: ret ; -; LMULMAX1-RV32-LABEL: mulhu_v16i16: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) -; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI130_0) -; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI130_0) -; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) -; LMULMAX1-RV32-NEXT: vle16.v v27, (a0) -; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vdivu.vv v26, v27, v26 -; LMULMAX1-RV32-NEXT: vse16.v v26, (a0) -; LMULMAX1-RV32-NEXT: vse16.v v25, (a1) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhu_v16i16: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI130_0) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI130_0) -; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) -; LMULMAX1-RV64-NEXT: vle16.v v27, (a0) -; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 -; LMULMAX1-RV64-NEXT: vse16.v v26, (a0) -; LMULMAX1-RV64-NEXT: vse16.v v25, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX1-LABEL: mulhu_v16i16: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle16.v v25, (a1) +; LMULMAX1-NEXT: lui a2, %hi(.LCPI130_0) +; LMULMAX1-NEXT: addi a2, a2, %lo(.LCPI130_0) +; LMULMAX1-NEXT: vle16.v v26, (a2) +; LMULMAX1-NEXT: vle16.v 
v27, (a0) +; LMULMAX1-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-NEXT: vse16.v v26, (a0) +; LMULMAX1-NEXT: vse16.v v25, (a1) +; LMULMAX1-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = udiv <16 x i16> %a, store <16 x i16> %b, <16 x i16>* %x @@ -4086,6 +4040,63 @@ define void @mulhu_v8i32(<8 x i32>* %x) { } define void @mulhu_v4i64(<4 x i64>* %x) { +; LMULMAX2-RV32-LABEL: mulhu_v4i64: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle64.v v26, (a0) +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI132_0) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI132_0) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmulhu.vv v28, v26, v28 +; LMULMAX2-RV32-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, 524288 +; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmv.s.x v30, a1 +; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmv.v.i v8, 0 +; LMULMAX2-RV32-NEXT: vsetivli a1, 6, e32,m2,tu,mu +; LMULMAX2-RV32-NEXT: vslideup.vi v8, v30, 5 +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmulhu.vv v26, v26, v8 +; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI132_1) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI132_1) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vse64.v v26, (a0) +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: mulhu_v4i64: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV64-NEXT: vle64.v v26, (a0) +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI132_0) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI132_0) +; LMULMAX2-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX2-RV64-NEXT: vmulhu.vv v28, v26, v28 +; LMULMAX2-RV64-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: addi a1, zero, -1 +; LMULMAX2-RV64-NEXT: slli a1, a1, 63 +; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e64,m2,ta,mu +; LMULMAX2-RV64-NEXT: vmv.s.x v30, a1 +; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV64-NEXT: vmv.v.i v8, 0 +; LMULMAX2-RV64-NEXT: vsetivli a1, 3, e64,m2,tu,mu +; LMULMAX2-RV64-NEXT: vslideup.vi v8, v30, 2 +; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI132_1) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI132_1) +; LMULMAX2-RV64-NEXT: vle64.v v30, (a1) +; LMULMAX2-RV64-NEXT: vmulhu.vv v26, v26, v8 +; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsrl.vv v26, v26, v30 +; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) +; LMULMAX2-RV64-NEXT: ret +; ; LMULMAX1-RV32-LABEL: mulhu_v4i64: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu @@ -4203,35 +4214,20 @@ define void @mulhs_v32i8(<32 x i8>* %x) { ; LMULMAX2-NEXT: vse8.v v26, (a0) ; LMULMAX2-NEXT: ret ; -; LMULMAX1-RV32-LABEL: mulhs_v32i8: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle8.v v25, (a1) -; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI133_0) -; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI133_0) -; LMULMAX1-RV32-NEXT: vle8.v v26, (a2) -; LMULMAX1-RV32-NEXT: vle8.v v27, (a0) -; LMULMAX1-RV32-NEXT: vdivu.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vdivu.vv v26, v27, 
v26 -; LMULMAX1-RV32-NEXT: vse8.v v26, (a0) -; LMULMAX1-RV32-NEXT: vse8.v v25, (a1) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhs_v32i8: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 16, e8,m1,ta,mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle8.v v25, (a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI133_0) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI133_0) -; LMULMAX1-RV64-NEXT: vle8.v v26, (a2) -; LMULMAX1-RV64-NEXT: vle8.v v27, (a0) -; LMULMAX1-RV64-NEXT: vdivu.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vdivu.vv v26, v27, v26 -; LMULMAX1-RV64-NEXT: vse8.v v26, (a0) -; LMULMAX1-RV64-NEXT: vse8.v v25, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX1-LABEL: mulhs_v32i8: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 16, e8,m1,ta,mu +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle8.v v25, (a1) +; LMULMAX1-NEXT: lui a2, %hi(.LCPI133_0) +; LMULMAX1-NEXT: addi a2, a2, %lo(.LCPI133_0) +; LMULMAX1-NEXT: vle8.v v26, (a2) +; LMULMAX1-NEXT: vle8.v v27, (a0) +; LMULMAX1-NEXT: vdivu.vv v25, v25, v26 +; LMULMAX1-NEXT: vdivu.vv v26, v27, v26 +; LMULMAX1-NEXT: vse8.v v26, (a0) +; LMULMAX1-NEXT: vse8.v v25, (a1) +; LMULMAX1-NEXT: ret %a = load <32 x i8>, <32 x i8>* %x %b = udiv <32 x i8> %a, store <32 x i8> %b, <32 x i8>* %x @@ -4253,35 +4249,20 @@ define void @mulhs_v16i16(<16 x i16>* %x) { ; LMULMAX2-NEXT: vse16.v v26, (a0) ; LMULMAX2-NEXT: ret ; -; LMULMAX1-RV32-LABEL: mulhs_v16i16: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle16.v v25, (a1) -; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI134_0) -; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI134_0) -; LMULMAX1-RV32-NEXT: vle16.v v26, (a2) -; LMULMAX1-RV32-NEXT: vle16.v v27, (a0) -; LMULMAX1-RV32-NEXT: vdiv.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vdiv.vv v26, v27, v26 -; LMULMAX1-RV32-NEXT: vse16.v v26, (a0) -; LMULMAX1-RV32-NEXT: vse16.v v25, (a1) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhs_v16i16: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle16.v v25, (a1) -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI134_0) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI134_0) -; LMULMAX1-RV64-NEXT: vle16.v v26, (a2) -; LMULMAX1-RV64-NEXT: vle16.v v27, (a0) -; LMULMAX1-RV64-NEXT: vdiv.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vdiv.vv v26, v27, v26 -; LMULMAX1-RV64-NEXT: vse16.v v26, (a0) -; LMULMAX1-RV64-NEXT: vse16.v v25, (a1) -; LMULMAX1-RV64-NEXT: ret +; LMULMAX1-LABEL: mulhs_v16i16: +; LMULMAX1: # %bb.0: +; LMULMAX1-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; LMULMAX1-NEXT: addi a1, a0, 16 +; LMULMAX1-NEXT: vle16.v v25, (a1) +; LMULMAX1-NEXT: lui a2, %hi(.LCPI134_0) +; LMULMAX1-NEXT: addi a2, a2, %lo(.LCPI134_0) +; LMULMAX1-NEXT: vle16.v v26, (a2) +; LMULMAX1-NEXT: vle16.v v27, (a0) +; LMULMAX1-NEXT: vdiv.vv v25, v25, v26 +; LMULMAX1-NEXT: vdiv.vv v26, v27, v26 +; LMULMAX1-NEXT: vse16.v v26, (a0) +; LMULMAX1-NEXT: vse16.v v25, (a1) +; LMULMAX1-NEXT: ret %a = load <16 x i16>, <16 x i16>* %x %b = sdiv <16 x i16> %a, store <16 x i16> %b, <16 x i16>* %x @@ -4289,6 +4270,34 @@ define void @mulhs_v16i16(<16 x i16>* %x) { } define void @mulhs_v8i32(<8 x i32>* %x) { +; LMULMAX2-RV32-LABEL: mulhs_v8i32: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v26, (a0) +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI135_0) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI135_0) +; 
LMULMAX2-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV32-NEXT: vmulh.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vsrl.vi v28, v26, 31 +; LMULMAX2-RV32-NEXT: vsra.vi v26, v26, 1 +; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vse32.v v26, (a0) +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: mulhs_v8i32: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: vsetivli a1, 8, e32,m2,ta,mu +; LMULMAX2-RV64-NEXT: vle32.v v26, (a0) +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI135_0) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI135_0) +; LMULMAX2-RV64-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV64-NEXT: vmulh.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vsra.vi v26, v26, 1 +; LMULMAX2-RV64-NEXT: vsrl.vi v28, v26, 31 +; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vse32.v v26, (a0) +; LMULMAX2-RV64-NEXT: ret +; ; LMULMAX1-RV32-LABEL: mulhs_v8i32: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu @@ -4331,6 +4340,62 @@ define void @mulhs_v8i32(<8 x i32>* %x) { } define void @mulhs_v4i64(<4 x i64>* %x) { +; LMULMAX2-RV32-LABEL: mulhs_v4i64: +; LMULMAX2-RV32: # %bb.0: +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle64.v v26, (a0) +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI136_0) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI136_0) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmul.vv v28, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI136_1) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI136_1) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vmulh.vv v26, v26, v30 +; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI136_2) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI136_2) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v28, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsrl.vv v28, v26, v28 +; LMULMAX2-RV32-NEXT: lui a1, %hi(.LCPI136_3) +; LMULMAX2-RV32-NEXT: addi a1, a1, %lo(.LCPI136_3) +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vle32.v v30, (a1) +; LMULMAX2-RV32-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsra.vv v26, v26, v30 +; LMULMAX2-RV32-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-RV32-NEXT: vse64.v v26, (a0) +; LMULMAX2-RV32-NEXT: ret +; +; LMULMAX2-RV64-LABEL: mulhs_v4i64: +; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu +; LMULMAX2-RV64-NEXT: vle64.v v26, (a0) +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI136_0) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI136_0) +; LMULMAX2-RV64-NEXT: vle64.v v28, (a1) +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI136_1) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI136_1) +; LMULMAX2-RV64-NEXT: vle64.v v30, (a1) +; LMULMAX2-RV64-NEXT: vmul.vv v28, v26, v28 +; LMULMAX2-RV64-NEXT: vmulh.vv v26, v26, v30 +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI136_2) +; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI136_2) +; LMULMAX2-RV64-NEXT: vle64.v v30, (a1) +; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: addi a1, zero, 63 +; LMULMAX2-RV64-NEXT: vsrl.vx v28, v26, a1 +; LMULMAX2-RV64-NEXT: vsra.vv v26, v26, v30 +; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) +; LMULMAX2-RV64-NEXT: ret +; ; LMULMAX1-RV32-LABEL: mulhs_v4i64: ; LMULMAX1-RV32: # %bb.0: ; 
LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu @@ -5199,24 +5264,24 @@ define void @add_vi_v4i32(<4 x i32>* %x) { } define void @add_vi_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: add_vi_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.v.i v26, -1 -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: add_vi_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vadd.vi v25, v25, -1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: add_vi_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV32-NEXT: vmv.v.i v26, -1 +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: add_vi_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vadd.vi v25, v25, -1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 -1, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -5274,26 +5339,26 @@ define void @add_iv_v4i32(<4 x i32>* %x) { } define void @add_iv_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: add_iv_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI160_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI160_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: add_iv_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vadd.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: add_iv_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI160_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI160_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: add_iv_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vadd.vi v25, v25, 1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 1, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -5450,25 +5515,25 @@ define void @sub_vi_v4i32(<4 x i32>* %x) { } define void @sub_vi_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: sub_vi_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; 
LMULMAX1-RV32-NEXT: vmv.v.i v26, -1 -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: sub_vi_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi a1, zero, -1 -; LMULMAX1-RV64-NEXT: vsub.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: sub_vi_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV32-NEXT: vmv.v.i v26, -1 +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsub.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: sub_vi_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: addi a1, zero, -1 +; RV64-NEXT: vsub.vx v25, v25, a1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 -1, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -5526,26 +5591,26 @@ define void @sub_iv_v4i32(<4 x i32>* %x) { } define void @sub_iv_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: sub_iv_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI174_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI174_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsub.vv v25, v26, v25 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: sub_iv_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vrsub.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: sub_iv_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI174_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI174_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsub.vv v25, v26, v25 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: sub_iv_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vrsub.vi v25, v25, 1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 1, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -5795,26 +5860,26 @@ define void @and_vi_v4i32(<4 x i32>* %x) { } define void @and_vi_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: and_vi_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI190_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI190_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; 
LMULMAX1-RV64-LABEL: and_vi_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vand.vi v25, v25, -2 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: and_vi_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI190_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI190_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vand.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: and_vi_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vand.vi v25, v25, -2 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 -2, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -5872,26 +5937,26 @@ define void @and_iv_v4i32(<4 x i32>* %x) { } define void @and_iv_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: and_iv_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI194_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI194_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vand.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: and_iv_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vand.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: and_iv_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI194_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI194_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vand.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: and_iv_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vand.vi v25, v25, 1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 1, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -6045,26 +6110,26 @@ define void @or_vi_v4i32(<4 x i32>* %x) { } define void @or_vi_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: or_vi_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI204_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI204_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: or_vi_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vor.vi v25, v25, -2 -; 
LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: or_vi_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI204_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI204_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vor.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: or_vi_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vor.vi v25, v25, -2 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 -2, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -6122,26 +6187,26 @@ define void @or_iv_v4i32(<4 x i32>* %x) { } define void @or_iv_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: or_iv_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI208_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI208_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vor.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: or_iv_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vor.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: or_iv_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI208_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI208_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vor.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: or_iv_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vor.vi v25, v25, 1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 1, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -6295,24 +6360,24 @@ define void @xor_vi_v4i32(<4 x i32>* %x) { } define void @xor_vi_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: xor_vi_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.v.i v26, -1 -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vxor.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: xor_vi_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vxor.vi v25, v25, -1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: xor_vi_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV32-NEXT: vmv.v.i v26, -1 +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: 
vxor.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: xor_vi_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vxor.vi v25, v25, -1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 -1, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -6370,26 +6435,26 @@ define void @xor_iv_v4i32(<4 x i32>* %x) { } define void @xor_iv_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: xor_iv_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI222_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI222_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vxor.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: xor_iv_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vxor.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: xor_iv_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI222_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI222_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vxor.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: xor_iv_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vxor.vi v25, v25, 1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 1, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -6543,26 +6608,26 @@ define void @lshr_vi_v4i32(<4 x i32>* %x) { } define void @lshr_vi_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: lshr_vi_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI232_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI232_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: lshr_vi_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 31 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: lshr_vi_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI232_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI232_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsrl.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: lshr_vi_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, 
(a0) +; RV64-NEXT: vsrl.vi v25, v25, 31 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 31, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -6668,26 +6733,26 @@ define void @ashr_vi_v4i32(<4 x i32>* %x) { } define void @ashr_vi_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: ashr_vi_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI239_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI239_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsra.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: ashr_vi_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vsra.vi v25, v25, 31 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: ashr_vi_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI239_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI239_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsra.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: ashr_vi_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vsra.vi v25, v25, 31 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 31, i32 0 %c = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -6793,26 +6858,26 @@ define void @shl_vi_v4i32(<4 x i32>* %x) { } define void @shl_vi_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: shl_vi_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI246_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI246_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsll.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: shl_vi_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vsll.vi v25, v25, 31 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: shl_vi_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI246_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI246_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsll.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: shl_vi_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vsll.vi v25, v25, 31 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = insertelement <2 x i64> undef, i64 31, i32 0 %c = 
shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer @@ -7078,33 +7143,33 @@ define void @mulhu_vx_v16i8(<16 x i8>* %x) { } define void @mulhu_vx_v8i16(<8 x i16>* %x) { -; LMULMAX1-RV32-LABEL: mulhu_vx_v8i16: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, 2 -; LMULMAX1-RV32-NEXT: addi a1, a1, 1171 -; LMULMAX1-RV32-NEXT: vmulhu.vx v26, v25, a1 -; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhu_vx_v8i16: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, 2 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1171 -; LMULMAX1-RV64-NEXT: vmulhu.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vsub.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhu_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) +; RV32-NEXT: lui a1, 2 +; RV32-NEXT: addi a1, a1, 1171 +; RV32-NEXT: vmulhu.vx v26, v25, a1 +; RV32-NEXT: vsub.vv v25, v25, v26 +; RV32-NEXT: vsrl.vi v25, v25, 1 +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: vsrl.vi v25, v25, 2 +; RV32-NEXT: vse16.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhu_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV64-NEXT: vle16.v v25, (a0) +; RV64-NEXT: lui a1, 2 +; RV64-NEXT: addiw a1, a1, 1171 +; RV64-NEXT: vmulhu.vx v26, v25, a1 +; RV64-NEXT: vsub.vv v25, v25, v26 +; RV64-NEXT: vsrl.vi v25, v25, 1 +; RV64-NEXT: vadd.vv v25, v25, v26 +; RV64-NEXT: vsrl.vi v25, v25, 2 +; RV64-NEXT: vse16.v v25, (a0) +; RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = udiv <8 x i16> %a, store <8 x i16> %b, <8 x i16>* %x @@ -7112,27 +7177,27 @@ define void @mulhu_vx_v8i16(<8 x i16>* %x) { } define void @mulhu_vx_v4i32(<4 x i32>* %x) { -; LMULMAX1-RV32-LABEL: mulhu_vx_v4i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, 838861 -; LMULMAX1-RV32-NEXT: addi a1, a1, -819 -; LMULMAX1-RV32-NEXT: vmulhu.vx v25, v25, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhu_vx_v4i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, 838861 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 -; LMULMAX1-RV64-NEXT: vmulhu.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 2 -; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhu_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v25, (a0) +; RV32-NEXT: lui a1, 838861 +; RV32-NEXT: addi a1, a1, -819 +; RV32-NEXT: vmulhu.vx v25, v25, a1 +; RV32-NEXT: vsrl.vi v25, v25, 2 +; RV32-NEXT: vse32.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhu_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV64-NEXT: vle32.v v25, (a0) +; RV64-NEXT: lui a1, 838861 +; RV64-NEXT: addiw a1, a1, 
-819 +; RV64-NEXT: vmulhu.vx v25, v25, a1 +; RV64-NEXT: vsrl.vi v25, v25, 2 +; RV64-NEXT: vse32.v v25, (a0) +; RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = udiv <4 x i32> %a, store <4 x i32> %b, <4 x i32>* %x @@ -7140,41 +7205,41 @@ define void @mulhu_vx_v4i32(<4 x i32>* %x) { } define void @mulhu_vx_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: mulhu_vx_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI265_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI265_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI265_1) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI265_1) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhu_vx_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, 1026731 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 -; LMULMAX1-RV64-NEXT: vmulhu.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsrl.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhu_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI265_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI265_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vmulhu.vv v25, v25, v26 +; RV32-NEXT: lui a1, %hi(.LCPI265_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI265_1) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsrl.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhu_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: lui a1, 1026731 +; RV64-NEXT: addiw a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, -1365 +; RV64-NEXT: vmulhu.vx v25, v25, a1 +; RV64-NEXT: vsrl.vi v25, v25, 1 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = udiv <2 x i64> %a, store <2 x i64> %b, <2 x i64>* %x @@ -7198,31 +7263,31 @@ define void @mulhs_vx_v16i8(<16 x i8>* %x) { } define void @mulhs_vx_v8i16(<8 x i16>* %x) { -; LMULMAX1-RV32-LABEL: mulhs_vx_v8i16: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle16.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, 5 -; LMULMAX1-RV32-NEXT: addi a1, a1, -1755 -; LMULMAX1-RV32-NEXT: vmulh.vx v25, v25, a1 -; LMULMAX1-RV32-NEXT: vsra.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 15 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, 
v26 -; LMULMAX1-RV32-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhs_vx_v8i16: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle16.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, 5 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -1755 -; LMULMAX1-RV64-NEXT: vmulh.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsra.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 15 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vse16.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhs_vx_v8i16: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV32-NEXT: vle16.v v25, (a0) +; RV32-NEXT: lui a1, 5 +; RV32-NEXT: addi a1, a1, -1755 +; RV32-NEXT: vmulh.vx v25, v25, a1 +; RV32-NEXT: vsra.vi v25, v25, 1 +; RV32-NEXT: vsrl.vi v26, v25, 15 +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: vse16.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhs_vx_v8i16: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; RV64-NEXT: vle16.v v25, (a0) +; RV64-NEXT: lui a1, 5 +; RV64-NEXT: addiw a1, a1, -1755 +; RV64-NEXT: vmulh.vx v25, v25, a1 +; RV64-NEXT: vsra.vi v25, v25, 1 +; RV64-NEXT: vsrl.vi v26, v25, 15 +; RV64-NEXT: vadd.vv v25, v25, v26 +; RV64-NEXT: vse16.v v25, (a0) +; RV64-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x %b = sdiv <8 x i16> %a, store <8 x i16> %b, <8 x i16>* %x @@ -7230,31 +7295,31 @@ define void @mulhs_vx_v8i16(<8 x i16>* %x) { } define void @mulhs_vx_v4i32(<4 x i32>* %x) { -; LMULMAX1-RV32-LABEL: mulhs_vx_v4i32: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, 629146 -; LMULMAX1-RV32-NEXT: addi a1, a1, -1639 -; LMULMAX1-RV32-NEXT: vmulh.vx v25, v25, a1 -; LMULMAX1-RV32-NEXT: vsrl.vi v26, v25, 31 -; LMULMAX1-RV32-NEXT: vsra.vi v25, v25, 1 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhs_vx_v4i32: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle32.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, 629146 -; LMULMAX1-RV64-NEXT: addiw a1, a1, -1639 -; LMULMAX1-RV64-NEXT: vmulh.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: vsra.vi v25, v25, 1 -; LMULMAX1-RV64-NEXT: vsrl.vi v26, v25, 31 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vse32.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhs_vx_v4i32: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v25, (a0) +; RV32-NEXT: lui a1, 629146 +; RV32-NEXT: addi a1, a1, -1639 +; RV32-NEXT: vmulh.vx v25, v25, a1 +; RV32-NEXT: vsrl.vi v26, v25, 31 +; RV32-NEXT: vsra.vi v25, v25, 1 +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: vse32.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhs_vx_v4i32: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; RV64-NEXT: vle32.v v25, (a0) +; RV64-NEXT: lui a1, 629146 +; RV64-NEXT: addiw a1, a1, -1639 +; RV64-NEXT: vmulh.vx v25, v25, a1 +; RV64-NEXT: vsra.vi v25, v25, 1 +; RV64-NEXT: vsrl.vi v26, v25, 31 +; RV64-NEXT: vadd.vv v25, v25, v26 +; RV64-NEXT: vse32.v v25, (a0) +; RV64-NEXT: ret %a = load <4 x i32>, <4 x i32>* %x %b = sdiv <4 x i32> %a, store <4 x i32> %b, <4 x i32>* %x @@ -7262,44 +7327,44 @@ define void @mulhs_vx_v4i32(<4 x i32>* %x) { } define void @mulhs_vx_v2i64(<2 x i64>* %x) { -; LMULMAX1-RV32-LABEL: mulhs_vx_v2i64: -; LMULMAX1-RV32: # %bb.0: -; LMULMAX1-RV32-NEXT: vsetivli 
a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI269_0) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI269_0) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmulh.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: lui a1, %hi(.LCPI269_1) -; LMULMAX1-RV32-NEXT: addi a1, a1, %lo(.LCPI269_1) -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) -; LMULMAX1-RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsrl.vv v26, v25, v26 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV32-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV32-NEXT: ret -; -; LMULMAX1-RV64-LABEL: mulhs_vx_v2i64: -; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: lui a1, 21845 -; LMULMAX1-RV64-NEXT: addiw a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1365 -; LMULMAX1-RV64-NEXT: slli a1, a1, 12 -; LMULMAX1-RV64-NEXT: addi a1, a1, 1366 -; LMULMAX1-RV64-NEXT: vmulh.vx v25, v25, a1 -; LMULMAX1-RV64-NEXT: addi a1, zero, 63 -; LMULMAX1-RV64-NEXT: vsrl.vx v26, v25, a1 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v26 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: ret +; RV32-LABEL: mulhs_vx_v2i64: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vle64.v v25, (a0) +; RV32-NEXT: lui a1, %hi(.LCPI269_0) +; RV32-NEXT: addi a1, a1, %lo(.LCPI269_0) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vmulh.vv v25, v25, v26 +; RV32-NEXT: lui a1, %hi(.LCPI269_1) +; RV32-NEXT: addi a1, a1, %lo(.LCPI269_1) +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu +; RV32-NEXT: vle32.v v26, (a1) +; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV32-NEXT: vsrl.vv v26, v25, v26 +; RV32-NEXT: vadd.vv v25, v25, v26 +; RV32-NEXT: vse64.v v25, (a0) +; RV32-NEXT: ret +; +; RV64-LABEL: mulhs_vx_v2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: lui a1, 21845 +; RV64-NEXT: addiw a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1365 +; RV64-NEXT: slli a1, a1, 12 +; RV64-NEXT: addi a1, a1, 1366 +; RV64-NEXT: vmulh.vx v25, v25, a1 +; RV64-NEXT: addi a1, zero, 63 +; RV64-NEXT: vsrl.vx v26, v25, a1 +; RV64-NEXT: vadd.vv v25, v25, v26 +; RV64-NEXT: vse64.v v25, (a0) +; RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = sdiv <2 x i64> %a, store <2 x i64> %b, <2 x i64>* %x -- GitLab From 00d0315a7cd37e28988950c2cf415c01958858c6 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Fri, 19 Mar 2021 10:07:12 -0700 Subject: [PATCH 0166/1000] [SCEV] Factor out a lambda for strict condition splitting [NFC] --- llvm/lib/Analysis/ScalarEvolution.cpp | 42 +++++++++++++++------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 7dd05d0751f1..f12ebe3a8727 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -10028,13 +10028,23 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB, bool ProvedNonStrictComparison = false; bool ProvedNonEquality = 
false; - if (ProvingStrictComparison) { - ProvedNonStrictComparison = - isKnownViaNonRecursiveReasoning(NonStrictPredicate, LHS, RHS); - ProvedNonEquality = - isKnownViaNonRecursiveReasoning(ICmpInst::ICMP_NE, LHS, RHS); + auto SplitAndProve = + [&](std::function Fn) -> bool { + if (!ProvedNonStrictComparison) + ProvedNonStrictComparison = Fn(NonStrictPredicate); + if (!ProvedNonEquality) + ProvedNonEquality = Fn(ICmpInst::ICMP_NE); if (ProvedNonStrictComparison && ProvedNonEquality) return true; + return false; + }; + + if (ProvingStrictComparison) { + auto ProofFn = [&](ICmpInst::Predicate P) { + return isKnownViaNonRecursiveReasoning(P, LHS, RHS); + }; + if (SplitAndProve(ProofFn)) + return true; } // Try to prove (Pred, LHS, RHS) using isImpliedViaGuard. @@ -10042,13 +10052,10 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB, if (isImpliedViaGuard(Block, Pred, LHS, RHS)) return true; if (ProvingStrictComparison) { - if (!ProvedNonStrictComparison) - ProvedNonStrictComparison = - isImpliedViaGuard(Block, NonStrictPredicate, LHS, RHS); - if (!ProvedNonEquality) - ProvedNonEquality = - isImpliedViaGuard(Block, ICmpInst::ICMP_NE, LHS, RHS); - if (ProvedNonStrictComparison && ProvedNonEquality) + auto ProofFn = [&](ICmpInst::Predicate P) { + return isImpliedViaGuard(Block, P, LHS, RHS); + }; + if (SplitAndProve(ProofFn)) return true; } return false; @@ -10060,13 +10067,10 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB, if (isImpliedCond(Pred, LHS, RHS, Condition, Inverse, Context)) return true; if (ProvingStrictComparison) { - if (!ProvedNonStrictComparison) - ProvedNonStrictComparison = isImpliedCond(NonStrictPredicate, LHS, RHS, - Condition, Inverse, Context); - if (!ProvedNonEquality) - ProvedNonEquality = isImpliedCond(ICmpInst::ICMP_NE, LHS, RHS, - Condition, Inverse, Context); - if (ProvedNonStrictComparison && ProvedNonEquality) + auto ProofFn = [&](ICmpInst::Predicate P) { + return isImpliedCond(P, LHS, RHS, Condition, Inverse, Context); + }; + if (SplitAndProve(ProofFn)) return true; } return false; -- GitLab From 6ca178cd78a99d682d0be43eff1a808c1bcf47e6 Mon Sep 17 00:00:00 2001 From: Emily Shi Date: Thu, 18 Mar 2021 19:14:40 -0700 Subject: [PATCH 0167/1000] [asan] specify c++ version in tests to fix compile error If we don't specify the C++ version in these tests, it could cause compile errors because the compiler could default to an older C++ standard. rdar://75247244 Reviewed By: vitalybuka Differential Revision: https://reviews.llvm.org/D98913 --- compiler-rt/test/asan/TestCases/asan_update_allocation.cpp | 2 +- compiler-rt/test/asan/TestCases/lsan_crash.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/asan_update_allocation.cpp b/compiler-rt/test/asan/TestCases/asan_update_allocation.cpp index 19f8073e0509..988a4f49f00e 100644 --- a/compiler-rt/test/asan/TestCases/asan_update_allocation.cpp +++ b/compiler-rt/test/asan/TestCases/asan_update_allocation.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx_asan -O0 %s -o %t +// RUN: %clangxx_asan -O0 %s --std=c++11 -o %t // RUN: not %run %t 10 0 2>&1 | FileCheck %s --check-prefixes=CHECK,T0 // RUN: not %run %t 10000000 0 2>&1 | FileCheck %s --check-prefixes=CHECK,T0 diff --git a/compiler-rt/test/asan/TestCases/lsan_crash.cpp b/compiler-rt/test/asan/TestCases/lsan_crash.cpp index 23c2569a0b73..09eddfde1373 100644 --- a/compiler-rt/test/asan/TestCases/lsan_crash.cpp +++ b/compiler-rt/test/asan/TestCases/lsan_crash.cpp @@ -1,4 +1,4 @@ -// RUN:
%clangxx_asan -O2 %s -o %t && %run %t +// RUN: %clangxx_asan -O2 %s --std=c++11 -o %t && %run %t #include #include -- GitLab From d399b82e2ab26b38745852534e85771dee4de296 Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Fri, 19 Mar 2021 10:17:06 +0000 Subject: [PATCH 0168/1000] [RISCV] Maintain fixed-length info when optimizing BUILD_VECTORs I'm not sure how I failed to notice this before, but when optimizing dominant-element BUILD_VECTORs we would lower via the scalable container type, which lost us the information about the fixed length of the vector types. By lowering via the fixed-length type we can preserve that information and eliminate redundant vsetvli instructions. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D98938 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 8 +- .../CodeGen/RISCV/rvv/fixed-vectors-bswap.ll | 7 - .../CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll | 7 - .../CodeGen/RISCV/rvv/fixed-vectors-cttz.ll | 7 - .../RISCV/rvv/fixed-vectors-fp-buildvec.ll | 21 +- .../CodeGen/RISCV/rvv/fixed-vectors-insert.ll | 1 - .../RISCV/rvv/fixed-vectors-int-buildvec.ll | 7 +- .../CodeGen/RISCV/rvv/fixed-vectors-int.ll | 261 ++++++++---------- 8 files changed, 124 insertions(+), 195 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6ddad93bc2dd..cd47d65d50a8 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1195,20 +1195,18 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, // Don't perform this optimization when optimizing for size, since // materializing elements and inserting them tends to cause code bloat. if (DominantValue && !DAG.shouldOptForSize()) { - unsigned Opc = - VT.isFloatingPoint() ? 
RISCVISD::VFMV_V_F_VL : RISCVISD::VMV_V_X_VL; - SDValue Vec = DAG.getNode(Opc, DL, ContainerVT, DominantValue, VL); + SDValue Vec = DAG.getSplatBuildVector(VT, DL, DominantValue); if (ValueCounts.size() != 1) { MVT XLenVT = Subtarget.getXLenVT(); for (unsigned I = 0; I < NumElts; ++I) { if (!Op.getOperand(I).isUndef() && Op.getOperand(I) != DominantValue) - Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Vec, + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, Op.getOperand(I), DAG.getConstant(I, DL, XLenVT)); } } - return convertFromScalableVector(VT, Vec, DAG, Subtarget); + return Vec; } return SDValue(); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll index 1b1a8e649adb..bb5dbda70eb0 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-bswap.ll @@ -670,9 +670,7 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV64-NEXT: or a1, a2, a1 ; LMULMAX2-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vmv.v.x v25, a1 -; LMULMAX2-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vmv.s.x v25, t1 -; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vse64.v v25, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -801,9 +799,7 @@ define void @bswap_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV64-NEXT: or a1, a2, a1 ; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.v.x v25, a1 -; LMULMAX1-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v25, t1 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x @@ -2255,7 +2251,6 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: or a1, a2, a1 ; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.v.x v26, a1 -; LMULMAX1-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v26, t4 ; LMULMAX1-RV64-NEXT: vmv.x.s a1, v25 ; LMULMAX1-RV64-NEXT: srli a2, a1, 40 @@ -2305,9 +2300,7 @@ define void @bswap_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: or a1, a2, a1 ; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.v.x v25, a1 -; LMULMAX1-RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v25, t4 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v26, (a6) ; LMULMAX1-RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll index cb48a7a7b236..561b01828120 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-ctlz.ll @@ -3878,9 +3878,7 @@ define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vmv.s.x v26, a1 -; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -4113,9 +4111,7 @@ define void @ctlz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 -; 
LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x @@ -11882,7 +11878,6 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 ; LMULMAX1-RV64-NEXT: vsetivli a1, 1, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v27, v25, 1 @@ -11940,9 +11935,7 @@ define void @ctlz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v27, a1 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vse64.v v27, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v26, (a6) ; LMULMAX1-RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll index d6c3aba0be8b..e18c38e8ce30 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-cttz.ll @@ -2706,9 +2706,7 @@ define void @cttz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX2-RV64-NEXT: and a1, a1, a4 ; LMULMAX2-RV64-NEXT: mul a1, a1, a5 ; LMULMAX2-RV64-NEXT: srli a1, a1, 56 -; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vmv.s.x v26, a1 -; LMULMAX2-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -2889,9 +2887,7 @@ define void @cttz_v2i64(<2 x i64>* %x, <2 x i64>* %y) { ; LMULMAX1-RV64-NEXT: and a1, a1, a4 ; LMULMAX1-RV64-NEXT: mul a1, a1, a5 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX1-RV64-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x @@ -8230,7 +8226,6 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a2, a2, a1 ; LMULMAX1-RV64-NEXT: srli a2, a2, 56 -; LMULMAX1-RV64-NEXT: vsetvli a4, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 ; LMULMAX1-RV64-NEXT: vsetivli a2, 1, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vslidedown.vi v26, v25, 1 @@ -8268,9 +8263,7 @@ define void @cttz_v4i64(<4 x i64>* %x, <4 x i64>* %y) { ; LMULMAX1-RV64-NEXT: and a2, a2, a5 ; LMULMAX1-RV64-NEXT: mul a1, a2, a1 ; LMULMAX1-RV64-NEXT: srli a1, a1, 56 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v26, a1 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX1-RV64-NEXT: vse64.v v27, (a6) ; LMULMAX1-RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index b8085f0bc618..a48323916e1a 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -23,13 +23,12 @@ define void @buildvec_no_vid_v4f32(<4 x float>* %x) { define void @buildvec_dominant0_v4f32(<4 x float>* %x) { ; CHECK-LABEL: buildvec_dominant0_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: fmv.w.x ft0, zero ; CHECK-NEXT: lui a1, %hi(.LCPI1_0) -; CHECK-NEXT: flw ft1, %lo(.LCPI1_0)(a1) -; CHECK-NEXT: vsetvli a1, zero, e32,m1,ta,mu -; 
CHECK-NEXT: vfmv.s.f v25, ft0 +; CHECK-NEXT: flw ft0, %lo(.LCPI1_0)(a1) +; CHECK-NEXT: fmv.w.x ft1, zero ; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vfmv.v.f v26, ft1 +; CHECK-NEXT: vfmv.s.f v25, ft1 +; CHECK-NEXT: vfmv.v.f v26, ft0 ; CHECK-NEXT: vsetivli a1, 3, e32,m1,tu,mu ; CHECK-NEXT: vslideup.vi v26, v25, 2 ; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu @@ -43,9 +42,8 @@ define void @buildvec_dominant1_v4f32(<4 x float>* %x, float %f) { ; CHECK-LABEL: buildvec_dominant1_v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: fmv.w.x ft0, zero -; CHECK-NEXT: vsetvli a1, zero, e32,m1,ta,mu -; CHECK-NEXT: vfmv.s.f v25, ft0 ; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; CHECK-NEXT: vfmv.s.f v25, ft0 ; CHECK-NEXT: vfmv.v.f v26, fa0 ; CHECK-NEXT: vsetivli a1, 2, e32,m1,tu,mu ; CHECK-NEXT: vslideup.vi v26, v25, 1 @@ -66,13 +64,12 @@ define void @buildvec_dominant2_v4f32(<4 x float>* %x, float %f) { ; CHECK-NEXT: lui a1, %hi(.LCPI3_0) ; CHECK-NEXT: flw ft0, %lo(.LCPI3_0)(a1) ; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vfmv.v.f v25, fa0 -; CHECK-NEXT: vsetvli a1, zero, e32,m1,ta,mu -; CHECK-NEXT: vfmv.s.f v26, ft0 +; CHECK-NEXT: vfmv.s.f v25, ft0 +; CHECK-NEXT: vfmv.v.f v26, fa0 ; CHECK-NEXT: vsetivli a1, 2, e32,m1,tu,mu -; CHECK-NEXT: vslideup.vi v25, v26, 1 +; CHECK-NEXT: vslideup.vi v26, v25, 1 ; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vse32.v v25, (a0) +; CHECK-NEXT: vse32.v v26, (a0) ; CHECK-NEXT: ret %v0 = insertelement <4 x float> undef, float %f, i32 0 %v1 = insertelement <4 x float> %v0, float 2.0, i32 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll index 19b3ef6defff..43626ca3f5dd 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -53,7 +53,6 @@ define void @insertelt_v3i64(<3 x i64>* %x, i64 %y) { ; RV32-NEXT: vsetivli a4, 4, e32,m1,ta,mu ; RV32-NEXT: lw a4, 16(a0) ; RV32-NEXT: vmv.v.x v26, a3 -; RV32-NEXT: vsetvli a3, zero, e32,m1,ta,mu ; RV32-NEXT: vmv.s.x v26, a4 ; RV32-NEXT: vsetivli a3, 4, e64,m2,tu,mu ; RV32-NEXT: vslideup.vi v28, v26, 2 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll index fecac9000096..7abea8116cbe 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -69,9 +69,8 @@ define void @buildvec_vid_mpy_imm_v16i8(<16 x i8>* %x) { define void @buildvec_dominant0_v8i16(<8 x i16>* %x) { ; CHECK-LABEL: buildvec_dominant0_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a1, zero, e16,m1,ta,mu -; CHECK-NEXT: vmv.s.x v25, zero ; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; CHECK-NEXT: vmv.s.x v25, zero ; CHECK-NEXT: vmv.v.i v26, 8 ; CHECK-NEXT: vsetivli a1, 4, e16,m1,tu,mu ; CHECK-NEXT: vslideup.vi v26, v25, 3 @@ -117,9 +116,7 @@ define void @buildvec_dominant2_v2i8(<2 x i8>* %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli a1, 2, e8,m1,ta,mu ; CHECK-NEXT: vmv.v.i v25, -1 -; CHECK-NEXT: vsetvli a1, zero, e8,m1,ta,mu ; CHECK-NEXT: vmv.s.x v25, zero -; CHECK-NEXT: vsetivli a1, 2, e8,m1,ta,mu ; CHECK-NEXT: vse8.v v25, (a0) ; CHECK-NEXT: ret store <2 x i8> , <2 x i8>* %x @@ -148,9 +145,7 @@ define void @buildvec_dominant0_v2i32(<2 x i64>* %x) { ; RV64-NEXT: addi a1, a1, -455 ; RV64-NEXT: slli a1, a1, 13 ; RV64-NEXT: addi a1, a1, -910 -; RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; RV64-NEXT: vmv.s.x v25, a1 -; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; 
RV64-NEXT: vse64.v v25, (a0) ; RV64-NEXT: ret store <2 x i64> , <2 x i64>* %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll index 33f2e0d3998e..84784ee82c1c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -872,28 +872,24 @@ define void @mulhu_v8i16(<8 x i16>* %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu ; CHECK-NEXT: vle16.v v25, (a0) +; CHECK-NEXT: vmv.v.i v26, 0 +; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vmv1r.v v27, v26 +; CHECK-NEXT: vmv.s.x v27, a1 ; CHECK-NEXT: addi a1, zero, 1 -; CHECK-NEXT: vsetvli a2, zero, e16,m1,ta,mu -; CHECK-NEXT: vmv.s.x v26, a1 -; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu -; CHECK-NEXT: vmv.v.i v27, 0 +; CHECK-NEXT: vmv.s.x v28, a1 ; CHECK-NEXT: vsetivli a1, 7, e16,m1,tu,mu -; CHECK-NEXT: vmv1r.v v28, v27 -; CHECK-NEXT: vslideup.vi v28, v26, 6 +; CHECK-NEXT: vslideup.vi v26, v28, 6 ; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu ; CHECK-NEXT: lui a1, %hi(.LCPI53_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_0) -; CHECK-NEXT: vle16.v v26, (a1) -; CHECK-NEXT: vsrl.vv v28, v25, v28 -; CHECK-NEXT: vmulhu.vv v26, v28, v26 -; CHECK-NEXT: vsub.vv v25, v25, v26 -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: vsetvli a2, zero, e16,m1,ta,mu -; CHECK-NEXT: vmv.s.x v27, a1 -; CHECK-NEXT: vsetivli a1, 8, e16,m1,ta,mu +; CHECK-NEXT: vle16.v v28, (a1) +; CHECK-NEXT: vsrl.vv v26, v25, v26 +; CHECK-NEXT: vmulhu.vv v26, v26, v28 ; CHECK-NEXT: lui a1, %hi(.LCPI53_1) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI53_1) ; CHECK-NEXT: vle16.v v28, (a1) +; CHECK-NEXT: vsub.vv v25, v25, v26 ; CHECK-NEXT: vmulhu.vv v25, v25, v27 ; CHECK-NEXT: vadd.vv v25, v25, v26 ; CHECK-NEXT: vsrl.vv v25, v25, v28 @@ -910,25 +906,21 @@ define void @mulhu_v4i32(<4 x i32>* %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu ; CHECK-NEXT: vle32.v v25, (a0) +; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: vmv.s.x v26, a1 +; CHECK-NEXT: vmv.v.i v27, 0 +; CHECK-NEXT: vsetivli a1, 3, e32,m1,tu,mu +; CHECK-NEXT: vslideup.vi v27, v26, 2 ; CHECK-NEXT: lui a1, %hi(.LCPI54_0) ; CHECK-NEXT: addi a1, a1, %lo(.LCPI54_0) +; CHECK-NEXT: vsetivli a2, 4, e32,m1,ta,mu ; CHECK-NEXT: vle32.v v26, (a1) ; CHECK-NEXT: vmulhu.vv v26, v25, v26 ; CHECK-NEXT: vsub.vv v25, v25, v26 -; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: vsetvli a2, zero, e32,m1,ta,mu -; CHECK-NEXT: vmv.s.x v27, a1 -; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vmv.v.i v28, 0 -; CHECK-NEXT: vsetivli a1, 3, e32,m1,tu,mu -; CHECK-NEXT: vslideup.vi v28, v27, 2 -; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu -; CHECK-NEXT: vmulhu.vv v25, v25, v28 +; CHECK-NEXT: vmulhu.vv v25, v25, v27 ; CHECK-NEXT: vadd.vv v25, v25, v26 ; CHECK-NEXT: addi a1, zero, 1 -; CHECK-NEXT: vsetvli a2, zero, e32,m1,ta,mu ; CHECK-NEXT: vmv.s.x v26, a1 -; CHECK-NEXT: vsetivli a1, 4, e32,m1,ta,mu ; CHECK-NEXT: vmv.v.i v27, 2 ; CHECK-NEXT: vsetivli a1, 4, e32,m1,tu,mu ; CHECK-NEXT: vslideup.vi v27, v26, 3 @@ -966,6 +958,9 @@ define void @mulhu_v2i64(<2 x i64>* %x) { ; RV64: # %bb.0: ; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; RV64-NEXT: vle64.v v25, (a0) +; RV64-NEXT: vmv.v.i v26, 2 +; RV64-NEXT: addi a1, zero, 1 +; RV64-NEXT: vmv.s.x v26, a1 ; RV64-NEXT: lui a1, 1035469 ; RV64-NEXT: addiw a1, a1, -819 ; RV64-NEXT: slli a1, a1, 12 @@ -974,7 +969,7 @@ define void @mulhu_v2i64(<2 x i64>* %x) { ; RV64-NEXT: addi a1, a1, -819 ; RV64-NEXT: slli a1, a1, 12 ; RV64-NEXT: addi a1, a1, -819 -; RV64-NEXT: vmv.v.x v26, a1 +; RV64-NEXT: vmv.v.x 
v27, a1 ; RV64-NEXT: lui a1, 1026731 ; RV64-NEXT: addiw a1, a1, -1365 ; RV64-NEXT: slli a1, a1, 12 @@ -983,15 +978,8 @@ define void @mulhu_v2i64(<2 x i64>* %x) { ; RV64-NEXT: addi a1, a1, -1365 ; RV64-NEXT: slli a1, a1, 12 ; RV64-NEXT: addi a1, a1, -1365 -; RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu -; RV64-NEXT: vmv.s.x v26, a1 -; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; RV64-NEXT: vmulhu.vv v25, v25, v26 -; RV64-NEXT: vmv.v.i v26, 2 -; RV64-NEXT: addi a1, zero, 1 -; RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu -; RV64-NEXT: vmv.s.x v26, a1 -; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; RV64-NEXT: vmv.s.x v27, a1 +; RV64-NEXT: vmulhu.vv v25, v25, v27 ; RV64-NEXT: vsrl.vv v25, v25, v26 ; RV64-NEXT: vse64.v v25, (a0) ; RV64-NEXT: ret @@ -1092,7 +1080,6 @@ define void @mulhs_v2i64(<2 x i64>* %x) { ; RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu ; RV32-NEXT: vmv.v.x v27, a2 ; RV32-NEXT: addi a1, a1, 1366 -; RV32-NEXT: vsetvli a2, zero, e32,m1,ta,mu ; RV32-NEXT: vmv.s.x v27, a1 ; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; RV32-NEXT: vmulh.vv v25, v25, v27 @@ -1104,9 +1091,8 @@ define void @mulhs_v2i64(<2 x i64>* %x) { ; RV32-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; RV32-NEXT: vsrl.vv v26, v25, v26 ; RV32-NEXT: addi a1, zero, 1 -; RV32-NEXT: vsetvli a2, zero, e32,m1,ta,mu +; RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu ; RV32-NEXT: vmv.s.x v27, a1 -; RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu ; RV32-NEXT: vmv.v.i v28, 0 ; RV32-NEXT: vsetivli a1, 3, e32,m1,tu,mu ; RV32-NEXT: vslideup.vi v28, v27, 2 @@ -1121,9 +1107,7 @@ define void @mulhs_v2i64(<2 x i64>* %x) { ; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; RV64-NEXT: vle64.v v25, (a0) ; RV64-NEXT: vmv.v.i v26, -1 -; RV64-NEXT: vsetvli a1, zero, e64,m1,ta,mu ; RV64-NEXT: vmv.s.x v26, zero -; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; RV64-NEXT: vmul.vv v26, v25, v26 ; RV64-NEXT: lui a1, 21845 ; RV64-NEXT: addiw a1, a1, 1365 @@ -1135,9 +1119,7 @@ define void @mulhs_v2i64(<2 x i64>* %x) { ; RV64-NEXT: addi a2, a1, 1365 ; RV64-NEXT: vmv.v.x v27, a2 ; RV64-NEXT: addi a1, a1, 1366 -; RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; RV64-NEXT: vmv.s.x v27, a1 -; RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu ; RV64-NEXT: vmulh.vv v25, v25, v27 ; RV64-NEXT: vadd.vv v25, v25, v26 ; RV64-NEXT: addi a1, zero, 63 @@ -3983,40 +3965,36 @@ define void @mulhu_v8i32(<8 x i32>* %x) { ; LMULMAX1-RV32-LABEL: mulhu_v8i32: ; LMULMAX1-RV32: # %bb.0: ; LMULMAX1-RV32-NEXT: vsetivli a1, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v25, (a0) ; LMULMAX1-RV32-NEXT: addi a1, a0, 16 -; LMULMAX1-RV32-NEXT: vle32.v v25, (a1) -; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI131_0) -; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI131_0) -; LMULMAX1-RV32-NEXT: vle32.v v26, (a2) -; LMULMAX1-RV32-NEXT: vle32.v v27, (a0) -; LMULMAX1-RV32-NEXT: vmulhu.vv v28, v25, v26 -; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vle32.v v26, (a1) ; LMULMAX1-RV32-NEXT: lui a2, 524288 -; LMULMAX1-RV32-NEXT: vsetvli a3, zero, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.s.x v29, a2 -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.v.i v30, 0 +; LMULMAX1-RV32-NEXT: vmv.s.x v27, a2 +; LMULMAX1-RV32-NEXT: vmv.v.i v28, 0 ; LMULMAX1-RV32-NEXT: vsetivli a2, 3, e32,m1,tu,mu -; LMULMAX1-RV32-NEXT: vslideup.vi v30, v29, 2 -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v30 -; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vslideup.vi v28, v27, 2 +; LMULMAX1-RV32-NEXT: lui a2, %hi(.LCPI131_0) +; LMULMAX1-RV32-NEXT: addi a2, a2, %lo(.LCPI131_0) +; 
LMULMAX1-RV32-NEXT: vsetivli a3, 4, e32,m1,ta,mu +; LMULMAX1-RV32-NEXT: vle32.v v27, (a2) +; LMULMAX1-RV32-NEXT: vmulhu.vv v29, v26, v27 +; LMULMAX1-RV32-NEXT: vsub.vv v26, v26, v29 +; LMULMAX1-RV32-NEXT: vmulhu.vv v26, v26, v28 +; LMULMAX1-RV32-NEXT: vadd.vv v26, v26, v29 ; LMULMAX1-RV32-NEXT: addi a2, zero, 1 -; LMULMAX1-RV32-NEXT: vsetvli a3, zero, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.s.x v28, a2 -; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vmv.v.i v29, 2 +; LMULMAX1-RV32-NEXT: vmv.s.x v29, a2 +; LMULMAX1-RV32-NEXT: vmv.v.i v30, 2 ; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,tu,mu -; LMULMAX1-RV32-NEXT: vslideup.vi v29, v28, 3 +; LMULMAX1-RV32-NEXT: vslideup.vi v30, v29, 3 ; LMULMAX1-RV32-NEXT: vsetivli a2, 4, e32,m1,ta,mu -; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v29 -; LMULMAX1-RV32-NEXT: vmulhu.vv v26, v27, v26 -; LMULMAX1-RV32-NEXT: vsub.vv v27, v27, v26 -; LMULMAX1-RV32-NEXT: vmulhu.vv v27, v27, v30 -; LMULMAX1-RV32-NEXT: vadd.vv v26, v27, v26 -; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v29 -; LMULMAX1-RV32-NEXT: vse32.v v26, (a0) -; LMULMAX1-RV32-NEXT: vse32.v v25, (a1) +; LMULMAX1-RV32-NEXT: vsrl.vv v26, v26, v30 +; LMULMAX1-RV32-NEXT: vmulhu.vv v27, v25, v27 +; LMULMAX1-RV32-NEXT: vsub.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vmulhu.vv v25, v25, v28 +; LMULMAX1-RV32-NEXT: vadd.vv v25, v25, v27 +; LMULMAX1-RV32-NEXT: vsrl.vv v25, v25, v30 +; LMULMAX1-RV32-NEXT: vse32.v v25, (a0) +; LMULMAX1-RV32-NEXT: vse32.v v26, (a1) ; LMULMAX1-RV32-NEXT: ret ; ; LMULMAX1-RV64-LABEL: mulhu_v8i32: @@ -4052,9 +4030,8 @@ define void @mulhu_v4i64(<4 x i64>* %x) { ; LMULMAX2-RV32-NEXT: vmulhu.vv v28, v26, v28 ; LMULMAX2-RV32-NEXT: vsub.vv v26, v26, v28 ; LMULMAX2-RV32-NEXT: lui a1, 524288 -; LMULMAX2-RV32-NEXT: vsetvli a2, zero, e32,m2,ta,mu +; LMULMAX2-RV32-NEXT: vsetivli a2, 8, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vmv.s.x v30, a1 -; LMULMAX2-RV32-NEXT: vsetivli a1, 8, e32,m2,ta,mu ; LMULMAX2-RV32-NEXT: vmv.v.i v8, 0 ; LMULMAX2-RV32-NEXT: vsetivli a1, 6, e32,m2,tu,mu ; LMULMAX2-RV32-NEXT: vslideup.vi v8, v30, 5 @@ -4074,26 +4051,24 @@ define void @mulhu_v4i64(<4 x i64>* %x) { ; LMULMAX2-RV64: # %bb.0: ; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: vle64.v v26, (a0) +; LMULMAX2-RV64-NEXT: addi a1, zero, -1 +; LMULMAX2-RV64-NEXT: slli a1, a1, 63 +; LMULMAX2-RV64-NEXT: vmv.s.x v28, a1 +; LMULMAX2-RV64-NEXT: vmv.v.i v30, 0 +; LMULMAX2-RV64-NEXT: vsetivli a1, 3, e64,m2,tu,mu +; LMULMAX2-RV64-NEXT: vslideup.vi v30, v28, 2 ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI132_0) ; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI132_0) +; LMULMAX2-RV64-NEXT: vsetivli a2, 4, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: vle64.v v28, (a1) ; LMULMAX2-RV64-NEXT: vmulhu.vv v28, v26, v28 -; LMULMAX2-RV64-NEXT: vsub.vv v26, v26, v28 -; LMULMAX2-RV64-NEXT: addi a1, zero, -1 -; LMULMAX2-RV64-NEXT: slli a1, a1, 63 -; LMULMAX2-RV64-NEXT: vsetvli a2, zero, e64,m2,ta,mu -; LMULMAX2-RV64-NEXT: vmv.s.x v30, a1 -; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu -; LMULMAX2-RV64-NEXT: vmv.v.i v8, 0 -; LMULMAX2-RV64-NEXT: vsetivli a1, 3, e64,m2,tu,mu -; LMULMAX2-RV64-NEXT: vslideup.vi v8, v30, 2 -; LMULMAX2-RV64-NEXT: vsetivli a1, 4, e64,m2,ta,mu ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI132_1) ; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI132_1) -; LMULMAX2-RV64-NEXT: vle64.v v30, (a1) -; LMULMAX2-RV64-NEXT: vmulhu.vv v26, v26, v8 +; LMULMAX2-RV64-NEXT: vle64.v v8, (a1) +; LMULMAX2-RV64-NEXT: vsub.vv v26, v26, v28 +; LMULMAX2-RV64-NEXT: vmulhu.vv v26, v26, v30 ; LMULMAX2-RV64-NEXT: vadd.vv v26, v26, v28 
-; LMULMAX2-RV64-NEXT: vsrl.vv v26, v26, v30 +; LMULMAX2-RV64-NEXT: vsrl.vv v26, v26, v8 ; LMULMAX2-RV64-NEXT: vse64.v v26, (a0) ; LMULMAX2-RV64-NEXT: ret ; @@ -4121,11 +4096,15 @@ define void @mulhu_v4i64(<4 x i64>* %x) { ; ; LMULMAX1-RV64-LABEL: mulhu_v4i64: ; LMULMAX1-RV64: # %bb.0: -; LMULMAX1-RV64-NEXT: addi a2, zero, 2 -; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, zero, 2 +; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) -; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) +; LMULMAX1-RV64-NEXT: addi a2, a0, 16 +; LMULMAX1-RV64-NEXT: vle64.v v26, (a2) +; LMULMAX1-RV64-NEXT: vmv.v.i v27, 0 +; LMULMAX1-RV64-NEXT: addi a3, zero, -1 +; LMULMAX1-RV64-NEXT: slli a3, a3, 63 +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a3 ; LMULMAX1-RV64-NEXT: lui a3, 1044935 ; LMULMAX1-RV64-NEXT: addiw a3, a3, 455 ; LMULMAX1-RV64-NEXT: slli a3, a3, 12 @@ -4134,7 +4113,7 @@ define void @mulhu_v4i64(<4 x i64>* %x) { ; LMULMAX1-RV64-NEXT: addi a3, a3, 455 ; LMULMAX1-RV64-NEXT: slli a3, a3, 13 ; LMULMAX1-RV64-NEXT: addi a3, a3, 911 -; LMULMAX1-RV64-NEXT: vmv.v.x v27, a3 +; LMULMAX1-RV64-NEXT: vmv.v.x v28, a3 ; LMULMAX1-RV64-NEXT: lui a3, 4681 ; LMULMAX1-RV64-NEXT: addiw a3, a3, 585 ; LMULMAX1-RV64-NEXT: slli a3, a3, 12 @@ -4143,53 +4122,39 @@ define void @mulhu_v4i64(<4 x i64>* %x) { ; LMULMAX1-RV64-NEXT: addi a3, a3, 585 ; LMULMAX1-RV64-NEXT: slli a3, a3, 13 ; LMULMAX1-RV64-NEXT: addi a3, a3, 1171 -; LMULMAX1-RV64-NEXT: vsetvli a4, zero, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmv.s.x v27, a3 -; LMULMAX1-RV64-NEXT: vsetivli a3, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmulhu.vv v27, v26, v27 -; LMULMAX1-RV64-NEXT: vsub.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: vmv.v.i v28, 0 -; LMULMAX1-RV64-NEXT: addi a3, zero, -1 -; LMULMAX1-RV64-NEXT: slli a3, a3, 63 -; LMULMAX1-RV64-NEXT: vsetvli a4, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v28, a3 -; LMULMAX1-RV64-NEXT: vsetivli a3, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmulhu.vv v26, v26, v28 -; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vmulhu.vv v28, v26, v28 +; LMULMAX1-RV64-NEXT: vsub.vv v26, v26, v28 +; LMULMAX1-RV64-NEXT: vmulhu.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v28 ; LMULMAX1-RV64-NEXT: vmv.v.i v27, 3 -; LMULMAX1-RV64-NEXT: vsetvli a3, zero, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 -; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a1 ; LMULMAX1-RV64-NEXT: vsrl.vv v26, v26, v27 -; LMULMAX1-RV64-NEXT: lui a2, 1035469 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -819 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, -819 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, -819 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, -819 -; LMULMAX1-RV64-NEXT: vmv.v.x v27, a2 -; LMULMAX1-RV64-NEXT: lui a2, 1026731 -; LMULMAX1-RV64-NEXT: addiw a2, a2, -1365 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, -1365 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, -1365 -; LMULMAX1-RV64-NEXT: slli a2, a2, 12 -; LMULMAX1-RV64-NEXT: addi a2, a2, -1365 -; LMULMAX1-RV64-NEXT: vsetvli a3, zero, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 -; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmulhu.vv v25, v25, v27 ; LMULMAX1-RV64-NEXT: vmv.v.i v27, 2 -; LMULMAX1-RV64-NEXT: addi a2, zero, 1 -; LMULMAX1-RV64-NEXT: vsetvli a3, zero, e64,m1,ta,mu -; 
LMULMAX1-RV64-NEXT: vmv.s.x v27, a2 -; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu +; LMULMAX1-RV64-NEXT: addi a1, zero, 1 +; LMULMAX1-RV64-NEXT: vmv.s.x v27, a1 +; LMULMAX1-RV64-NEXT: lui a1, 1035469 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -819 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -819 +; LMULMAX1-RV64-NEXT: vmv.v.x v28, a1 +; LMULMAX1-RV64-NEXT: lui a1, 1026731 +; LMULMAX1-RV64-NEXT: addiw a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: slli a1, a1, 12 +; LMULMAX1-RV64-NEXT: addi a1, a1, -1365 +; LMULMAX1-RV64-NEXT: vmv.s.x v28, a1 +; LMULMAX1-RV64-NEXT: vmulhu.vv v25, v25, v28 ; LMULMAX1-RV64-NEXT: vsrl.vv v25, v25, v27 ; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v26, (a1) +; LMULMAX1-RV64-NEXT: vse64.v v26, (a2) ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = udiv <4 x i64> %a, @@ -4416,14 +4381,12 @@ define void @mulhs_v4i64(<4 x i64>* %x) { ; LMULMAX1-RV64-LABEL: mulhs_v4i64: ; LMULMAX1-RV64: # %bb.0: ; LMULMAX1-RV64-NEXT: vsetivli a1, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vle64.v v25, (a0) ; LMULMAX1-RV64-NEXT: addi a1, a0, 16 -; LMULMAX1-RV64-NEXT: vle64.v v26, (a1) +; LMULMAX1-RV64-NEXT: vle64.v v25, (a1) +; LMULMAX1-RV64-NEXT: vle64.v v26, (a0) ; LMULMAX1-RV64-NEXT: vmv.v.i v27, -1 -; LMULMAX1-RV64-NEXT: vsetvli a2, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v27, zero -; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmul.vv v28, v26, v27 +; LMULMAX1-RV64-NEXT: vmul.vv v28, v25, v27 ; LMULMAX1-RV64-NEXT: lui a2, 21845 ; LMULMAX1-RV64-NEXT: addiw a2, a2, 1365 ; LMULMAX1-RV64-NEXT: slli a2, a2, 12 @@ -4434,24 +4397,22 @@ define void @mulhs_v4i64(<4 x i64>* %x) { ; LMULMAX1-RV64-NEXT: addi a3, a2, 1365 ; LMULMAX1-RV64-NEXT: vmv.v.x v29, a3 ; LMULMAX1-RV64-NEXT: addi a2, a2, 1366 -; LMULMAX1-RV64-NEXT: vsetvli a3, zero, e64,m1,ta,mu ; LMULMAX1-RV64-NEXT: vmv.s.x v29, a2 -; LMULMAX1-RV64-NEXT: vsetivli a2, 2, e64,m1,ta,mu -; LMULMAX1-RV64-NEXT: vmulh.vv v26, v26, v29 -; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v28 +; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v29 +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v28 ; LMULMAX1-RV64-NEXT: addi a2, zero, 63 -; LMULMAX1-RV64-NEXT: vsrl.vx v28, v26, a2 +; LMULMAX1-RV64-NEXT: vsrl.vx v28, v25, a2 ; LMULMAX1-RV64-NEXT: vid.v v30 -; LMULMAX1-RV64-NEXT: vsra.vv v26, v26, v30 -; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v28 -; LMULMAX1-RV64-NEXT: vmul.vv v27, v25, v27 -; LMULMAX1-RV64-NEXT: vmulh.vv v25, v25, v29 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vsrl.vx v27, v25, a2 ; LMULMAX1-RV64-NEXT: vsra.vv v25, v25, v30 -; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v27 -; LMULMAX1-RV64-NEXT: vse64.v v25, (a0) -; LMULMAX1-RV64-NEXT: vse64.v v26, (a1) +; LMULMAX1-RV64-NEXT: vadd.vv v25, v25, v28 +; LMULMAX1-RV64-NEXT: vmul.vv v27, v26, v27 +; LMULMAX1-RV64-NEXT: vmulh.vv v26, v26, v29 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vsrl.vx v27, v26, a2 +; LMULMAX1-RV64-NEXT: vsra.vv v26, v26, v30 +; LMULMAX1-RV64-NEXT: vadd.vv v26, v26, v27 +; LMULMAX1-RV64-NEXT: vse64.v v26, (a0) +; LMULMAX1-RV64-NEXT: vse64.v v25, (a1) ; LMULMAX1-RV64-NEXT: ret %a = load <4 x i64>, <4 x i64>* %x %b = sdiv <4 x i64> %a, -- 
GitLab From 3587728ed5d4c7cc036ea4f93ed8867951db4393 Mon Sep 17 00:00:00 2001 From: thomasraoux Date: Fri, 19 Mar 2021 10:32:23 -0700 Subject: [PATCH 0169/1000] [mlir] Fix cuda integration test failure --- mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir index c4ad89778d97..da3991fb1c73 100644 --- a/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir +++ b/mlir/test/Integration/GPU/CUDA/gpu-to-cubin.mlir @@ -5,6 +5,7 @@ // RUN: | mlir-cpu-runner \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_cuda_runtime%shlibext \ // RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \ +// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_c_runner_utils%shlibext \ // RUN: --entry-point-result=void \ // RUN: | FileCheck %s -- GitLab From 85f3f6b3cc2969fa0e7b38209dfe02354f7153dd Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 19 Mar 2021 10:39:33 -0700 Subject: [PATCH 0170/1000] [RISCV] Lower scalable vector masked loads to intrinsics to match fixed vectors and reduce isel patterns. Reviewed By: frasercrmck Differential Revision: https://reviews.llvm.org/D98840 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 61 ++++++++++++------- llvm/lib/Target/RISCV/RISCVISelLowering.h | 6 +- .../Target/RISCV/RISCVInstrInfoVSDPatterns.td | 37 ----------- 3 files changed, 41 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index cd47d65d50a8..6dfc2d46afe1 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -474,6 +474,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Custom); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); @@ -517,6 +519,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom); setOperationAction(ISD::FCOPYSIGN, VT, Legal); + setOperationAction(ISD::MLOAD, VT, Custom); + setOperationAction(ISD::MSTORE, VT, Custom); setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); @@ -1651,9 +1655,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, case ISD::STORE: return lowerFixedLengthVectorStoreToRVV(Op, DAG); case ISD::MLOAD: - return lowerFixedLengthVectorMaskedLoadToRVV(Op, DAG); + return lowerMLOAD(Op, DAG); case ISD::MSTORE: - return lowerFixedLengthVectorMaskedStoreToRVV(Op, DAG); + return lowerMSTORE(Op, DAG); case ISD::SETCC: return lowerFixedLengthVectorSetccToRVV(Op, DAG); case ISD::ADD: @@ -3194,50 +3198,63 @@ RISCVTargetLowering::lowerFixedLengthVectorStoreToRVV(SDValue Op, Store->getMemoryVT(), Store->getMemOperand()); } -SDValue RISCVTargetLowering::lowerFixedLengthVectorMaskedLoadToRVV( - SDValue Op, SelectionDAG &DAG) const { +SDValue RISCVTargetLowering::lowerMLOAD(SDValue Op, SelectionDAG &DAG) const { auto *Load = cast(Op); SDLoc DL(Op); MVT VT = Op.getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); MVT XLenVT = Subtarget.getXLenVT(); - SDValue Mask = - convertToScalableVector(MaskVT, Load->getMask(), DAG, 
Subtarget); - SDValue PassThru = - convertToScalableVector(ContainerVT, Load->getPassThru(), DAG, Subtarget); - SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + SDValue Mask = Load->getMask(); + SDValue PassThru = Load->getPassThru(); + SDValue VL; + + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget); + VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + } else + VL = DAG.getRegister(RISCV::X0, XLenVT); SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vle_mask, DL, XLenVT); SDValue Ops[] = {Load->getChain(), IntID, PassThru, Load->getBasePtr(), Mask, VL}; - SDValue NewLoad = + SDValue Result = DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, Load->getMemoryVT(), Load->getMemOperand()); + SDValue Chain = Result.getValue(1); - SDValue Result = convertFromScalableVector(VT, NewLoad, DAG, Subtarget); - return DAG.getMergeValues({Result, NewLoad.getValue(1)}, DL); + if (VT.isFixedLengthVector()) + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + + return DAG.getMergeValues({Result, Chain}, DL); } -SDValue RISCVTargetLowering::lowerFixedLengthVectorMaskedStoreToRVV( - SDValue Op, SelectionDAG &DAG) const { +SDValue RISCVTargetLowering::lowerMSTORE(SDValue Op, SelectionDAG &DAG) const { auto *Store = cast(Op); SDLoc DL(Op); SDValue Val = Store->getValue(); + SDValue Mask = Store->getMask(); MVT VT = Val.getSimpleValueType(); - MVT ContainerVT = getContainerForFixedLengthVector(VT); - MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); MVT XLenVT = Subtarget.getXLenVT(); + SDValue VL; - Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget); - SDValue Mask = - convertToScalableVector(MaskVT, Store->getMask(), DAG, Subtarget); + MVT ContainerVT = VT; + if (VT.isFixedLengthVector()) { + ContainerVT = getContainerForFixedLengthVector(VT); + MVT MaskVT = MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); - SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + } else + VL = DAG.getRegister(RISCV::X0, XLenVT); SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vse_mask, DL, XLenVT); return DAG.getMemIntrinsicNode( diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 35fdf2921e22..4546ee4d0f89 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -475,15 +475,13 @@ private: SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerABS(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerMLOAD(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorFCOPYSIGNToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerMGATHERMSCATTER(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue 
lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const; - SDValue lowerFixedLengthVectorMaskedLoadToRVV(SDValue Op, - SelectionDAG &DAG) const; - SDValue lowerFixedLengthVectorMaskedStoreToRVV(SDValue Op, - SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorSetccToRVV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFixedLengthVectorLogicOpToRVV(SDValue Op, SelectionDAG &DAG, unsigned MaskOpc, diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td index d847296e7e25..eaa404fa3be8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVSDPatterns.td @@ -33,21 +33,6 @@ def SplatPat : ComplexPattern; def SplatPat_uimm5 : ComplexPattern; -def masked_load : - PatFrag<(ops node:$ptr, node:$mask, node:$maskedoff), - (masked_ld node:$ptr, undef, node:$mask, node:$maskedoff), [{ - return !cast(N)->isExpandingLoad() && - cast(N)->getExtensionType() == ISD::NON_EXTLOAD && - cast(N)->isUnindexed(); -}]>; -def masked_store : - PatFrag<(ops node:$val, node:$ptr, node:$mask), - (masked_st node:$val, node:$ptr, undef, node:$mask), [{ - return !cast(N)->isTruncatingStore() && - !cast(N)->isCompressingStore() && - cast(N)->isUnindexed(); -}]>; - class SwapHelper { dag Value = !con(Prefix, !if(swap, B, A), !if(swap, A, B), Suffix); } @@ -68,25 +53,6 @@ multiclass VPatUSLoadStoreSDNode; } -multiclass VPatUSLoadStoreSDNodeMask -{ - defvar load_instr = !cast("PseudoVLE"#sew#"_V_"#vlmul.MX#"_MASK"); - defvar store_instr = !cast("PseudoVSE"#sew#"_V_"#vlmul.MX#"_MASK"); - // Load - def : Pat<(type (masked_load BaseAddr:$rs1, (mask_type V0), type:$merge)), - (load_instr reg_class:$merge, BaseAddr:$rs1, (mask_type V0), - avl, sew)>; - // Store - def : Pat<(masked_store type:$rs2, BaseAddr:$rs1, (mask_type V0)), - (store_instr reg_class:$rs2, BaseAddr:$rs1, (mask_type V0), - avl, sew)>; -} - multiclass VPatUSLoadStoreWholeVRSDNode; -foreach vti = AllVectors in - defm "" : VPatUSLoadStoreSDNodeMask; foreach vti = [VI8M1, VI16M1, VI32M1, VI64M1, VF16M1, VF32M1, VF64M1] in defm "" : VPatUSLoadStoreWholeVRSDNode; -- GitLab From 93a9d2de8f4f73b5785d539db4dfa3fb5bbffedc Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Thu, 18 Mar 2021 11:32:34 -0700 Subject: [PATCH 0171/1000] [VPlan] Add plain text (not DOT's digraph) dumps I foresee two uses for this: 1) It's easier to use those in a debugger. 2) Once we start implementing more VPlan-to-VPlan transformations (especially inner loop massaging stuff), using the vectorized LLVM IR as CHECK targets in LIT tests would become too obscure. I can imagine that we'd want to CHECK against VPlan dumps after multiple transformations instead. That would be easier with plain text dumps than with DOT format.
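As a minimal usage sketch of the entry points this patch adds below (the helper function and its name are hypothetical, and VPlan.h is an in-tree header under llvm/lib/Transforms/Vectorize, so code like this only builds from within that library):

  #include "VPlan.h"
  #include "llvm/Support/raw_ostream.h"

  // Dump one plan in both of the forms this patch distinguishes.
  static void debugDumpPlan(const llvm::VPlan &Plan) {
    Plan.print(llvm::errs());    // new plain-text dump
    Plan.printDOT(llvm::errs()); // DOT digraph, the previous default
  }

From a debugger the plain-text form is also reachable as Plan.dump(), and the vectorizer's own dumping can be switched back to DOT with the new -vplan-print-in-dot-format flag introduced below.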
Reviewed By: fhahn Differential Revision: https://reviews.llvm.org/D96628 --- .../Vectorize/LoopVectorizationPlanner.h | 5 +- .../Transforms/Vectorize/LoopVectorize.cpp | 16 +- llvm/lib/Transforms/Vectorize/VPlan.cpp | 140 ++++++++++++------ llvm/lib/Transforms/Vectorize/VPlan.h | 73 ++++++--- .../Transforms/LoopVectorize/icmp-uniforms.ll | 13 +- .../LoopVectorize/vplan-dot-printing.ll | 40 +++++ .../LoopVectorize/vplan-printing.ll | 129 ++++++++-------- .../Transforms/Vectorize/VPlanHCFGTest.cpp | 30 ++-- .../Transforms/Vectorize/VPlanTest.cpp | 43 +++++- 9 files changed, 328 insertions(+), 161 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index 1f8d5c8aa195..fae75e318b42 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -256,10 +256,7 @@ public: /// best selected VPlan. void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); - void printPlans(raw_ostream &O) { - for (const auto &Plan : VPlans) - O << *Plan; - } + void printPlans(raw_ostream &O); /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 6e310fb1ba95..61b6fa1bcc63 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -360,6 +360,10 @@ cl::opt llvm::EnableLoopVectorization( "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); +cl::opt PrintVPlansInDotFormat( + "vplan-print-in-dot-format", cl::init(false), cl::Hidden, + cl::desc("Use dot format instead of plain text when dumping VPlans")); + /// A helper function that returns the type of loaded or stored value. 
static Type *getMemInstValueType(Value *I) { assert((isa(I) || isa(I)) && @@ -7809,6 +7813,14 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, ILV.printDebugTracesAtEnd(); } +void LoopVectorizationPlanner::printPlans(raw_ostream &O) { + for (const auto &Plan : VPlans) + if (PrintVPlansInDotFormat) + Plan->printDOT(O); + else + Plan->print(O); +} + void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl &DeadInstructions) { @@ -9007,7 +9019,7 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { - O << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; + O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; IG->getInsertPos()->printAsOperand(O, false); O << ", "; getAddr()->printAsOperand(O, SlotTracker); @@ -9018,7 +9030,7 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, } for (unsigned i = 0; i < IG->getFactor(); ++i) if (Instruction *I = IG->getMember(i)) - O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i; + O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; } void VPWidenCallRecipe::execute(VPTransformState &State) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 6974502bad70..d8df4a710d88 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -399,6 +399,42 @@ void VPBasicBlock::dropAllReferences(VPValue *NewValue) { } } +void VPBasicBlock::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << getName() << ":\n"; + if (const VPValue *Pred = getPredicate()) { + O << Indent << "BlockPredicate:"; + Pred->printAsOperand(O, SlotTracker); + if (const auto *PredInst = dyn_cast(Pred)) + O << " (" << PredInst->getParent()->getName() << ")"; + O << '\n'; + } + + auto RecipeIndent = Indent + " "; + for (const VPRecipeBase &Recipe : *this) { + Recipe.print(O, RecipeIndent, SlotTracker); + O << '\n'; + } + + if (getSuccessors().empty()) { + O << Indent << "No successors\n"; + } else { + O << Indent << "Successor(s): "; + ListSeparator LS; + for (auto *Succ : getSuccessors()) + O << LS << Succ->getName(); + O << '\n'; + } + + if (const VPValue *CBV = getCondBit()) { + O << Indent << "CondBit: "; + CBV->printAsOperand(O, SlotTracker); + if (const auto *CBI = dyn_cast(CBV)) + O << " (" << CBI->getParent()->getName() << ")"; + O << '\n'; + } +} + void VPRegionBlock::dropAllReferences(VPValue *NewValue) { for (VPBlockBase *Block : depth_first(Entry)) // Drop all references in VPBasicBlocks and replace all uses with @@ -455,6 +491,17 @@ void VPRegionBlock::execute(VPTransformState *State) { State->Instance.reset(); } +void VPRegionBlock::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << (isReplicator() ? " " : " ") << getName() << ": {"; + auto NewIndent = Indent + " "; + for (auto *BlockBase : depth_first(Entry)) { + O << '\n'; + BlockBase->print(O, NewIndent, SlotTracker); + } + O << Indent << "}\n"; +} + void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) { assert(!Parent && "Recipe already in some VPBasicBlock"); assert(InsertPos->getParent() && @@ -683,9 +730,28 @@ void VPlan::execute(VPTransformState *State) { L->getExitBlock()); } +// TODO: Wrap those in #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)/#endif.
+LLVM_DUMP_METHOD +void VPlan::print(raw_ostream &O) const { + VPSlotTracker SlotTracker(this); + + O << "VPlan {"; + for (const VPBlockBase *Block : depth_first(getEntry())) { + O << '\n'; + Block->print(O, "", SlotTracker); + } + O << "}\n"; +} + +LLVM_DUMP_METHOD +void VPlan::printDOT(raw_ostream &O) const { + VPlanPrinter Printer(O, *this); + Printer.dump(); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD -void VPlan::dump() const { dbgs() << *this << '\n'; } +void VPlan::dump() const { print(dbgs()); } #endif void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB, @@ -804,46 +870,32 @@ void VPlanPrinter::dumpEdges(const VPBlockBase *Block) { } void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) { + // Implement dot-formatted dump by performing plain-text dump into the + // temporary storage followed by some post-processing. OS << Indent << getUID(BasicBlock) << " [label =\n"; bumpIndent(1); - OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\""; - bumpIndent(1); + std::string Str; + raw_string_ostream SS(Str); + // Use no indentation as we need to wrap the lines into quotes ourselves. + BasicBlock->print(SS, "", SlotTracker); - // Dump the block predicate. - const VPValue *Pred = BasicBlock->getPredicate(); - if (Pred) { - OS << " +\n" << Indent << " \"BlockPredicate: \""; - if (const VPInstruction *PredI = dyn_cast(Pred)) { - PredI->printAsOperand(OS, SlotTracker); - OS << " (" << DOT::EscapeString(PredI->getParent()->getName()) - << ")\\l\""; - } else - Pred->printAsOperand(OS, SlotTracker); - } + // We need to process each line of the output separately, so split + // single-string plain-text dump. + SmallVector Lines; + StringRef(Str).rtrim('\n').split(Lines, "\n"); - for (const VPRecipeBase &Recipe : *BasicBlock) { - OS << " +\n" << Indent << "\""; - // Don't indent inside the recipe printer as we printed it before the - // opening quote already. - Recipe.print(OS, "", SlotTracker); - OS << "\\l\""; - } + auto EmitLine = [&](StringRef Line, StringRef Suffix) { + OS << Indent << '"' << DOT::EscapeString(Line.str()) << "\\l\"" << Suffix; + }; - // Dump the condition bit. - const VPValue *CBV = BasicBlock->getCondBit(); - if (CBV) { - OS << " +\n" << Indent << " \"CondBit: "; - if (const VPInstruction *CBI = dyn_cast(CBV)) { - CBI->printAsOperand(OS, SlotTracker); - OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\""; - } else { - CBV->printAsOperand(OS, SlotTracker); - OS << "\""; - } - } + // Don't need the "+" after the last line.
+ for (auto Line : make_range(Lines.begin(), Lines.end() - 1)) + EmitLine(Line, " +\n"); + EmitLine(Lines.back(), "\n"); + + bumpIndent(-1); + OS << Indent << "]\n"; - bumpIndent(-2); - OS << "\n" << Indent << "]\n"; dumpEdges(BasicBlock); } @@ -863,25 +915,21 @@ void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) { dumpEdges(Region); } -void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) { - std::string IngredientString; - raw_string_ostream RSO(IngredientString); +void VPlanIngredient::print(raw_ostream &O) const { if (auto *Inst = dyn_cast(V)) { if (!Inst->getType()->isVoidTy()) { - Inst->printAsOperand(RSO, false); - RSO << " = "; + Inst->printAsOperand(O, false); + O << " = "; } - RSO << Inst->getOpcodeName() << " "; + O << Inst->getOpcodeName() << " "; unsigned E = Inst->getNumOperands(); if (E > 0) { - Inst->getOperand(0)->printAsOperand(RSO, false); + Inst->getOperand(0)->printAsOperand(O, false); for (unsigned I = 1; I < E; ++I) - Inst->getOperand(I)->printAsOperand(RSO << ", ", false); + Inst->getOperand(I)->printAsOperand(O << ", ", false); } } else // !Inst - V->printAsOperand(RSO, false); - RSO.flush(); - O << DOT::EscapeString(IngredientString); + V->printAsOperand(O, false); } void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 9b5d5d7e77be..5a98c63401b0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -577,12 +577,6 @@ public: OS << getName(); } - void print(raw_ostream &OS) const { - // TODO: Only printing VPBB name for now since we only have dot printing - // support for VPInstructions/Recipes. - printAsOperand(OS, false); - } - /// Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -593,6 +587,24 @@ public: /// Replace all operands of VPUsers in the block with \p NewValue and also /// replaces all uses of VPValues defined in the block with NewValue. virtual void dropAllReferences(VPValue *NewValue) = 0; + + /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines + /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using + /// consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual blocks is consistent with the whole VPlan printing. + virtual void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const = 0; + + /// Print plain-text dump of this VPlan to \p O. + void print(raw_ostream &O) const { + VPSlotTracker SlotTracker(getPlan()); + print(O, "", SlotTracker); + } + + /// Dump this VPBlockBase to dbgs(). + void dump() const { print(dbgs()); } }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR @@ -1246,12 +1258,11 @@ public: /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override { - O << " +\n" << Indent << "\"BRANCH-ON-MASK "; + O << Indent << "BRANCH-ON-MASK "; if (VPValue *Mask = getMask()) Mask->printAsOperand(O, SlotTracker); else O << " All-One"; - O << "\\l\""; } /// Return the mask used by this recipe. Note that a full mask is represented @@ -1463,6 +1474,15 @@ public: void dropAllReferences(VPValue *NewValue) override; + /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. 
\p + /// SlotTracker is used to print unnamed VPValue's using consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual blocks is consistent with the whole VPlan printing. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + using VPBlockBase::print; // Get the print(raw_stream &O) version. + private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. @@ -1554,6 +1574,16 @@ public: void execute(struct VPTransformState *State) override; void dropAllReferences(VPValue *NewValue) override; + + /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with + /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using + /// consequtive numbers. + /// + /// Note that the numbering is applied to the whole VPlan, so printing + /// individual regions is consistent with the whole VPlan printing. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; + using VPBlockBase::print; // Get the print(raw_stream &O) version. }; //===----------------------------------------------------------------------===// @@ -1806,6 +1836,12 @@ public: VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } + /// Print this VPlan to \p O. + void print(raw_ostream &O) const; + + /// Print this VPlan in DOT format to \p O. + void printDOT(raw_ostream &O) const; + /// Dump the plan to stderr (for debugging). void dump() const; @@ -1830,11 +1866,6 @@ private: /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. class VPlanPrinter { - friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan); - friend inline raw_ostream &operator<<(raw_ostream &OS, - const struct VPlanIngredient &I); - -private: raw_ostream &OS; const VPlan &Plan; unsigned Depth = 0; @@ -1845,9 +1876,6 @@ private: VPSlotTracker SlotTracker; - VPlanPrinter(raw_ostream &O, const VPlan &P) - : OS(O), Plan(P), SlotTracker(&P) {} - /// Handle indentation. void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } @@ -1877,25 +1905,28 @@ private: void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, const Twine &Label); - void dump(); +public: + VPlanPrinter(raw_ostream &O, const VPlan &P) + : OS(O), Plan(P), SlotTracker(&P) {} - static void printAsIngredient(raw_ostream &O, const Value *V); + void dump(); }; struct VPlanIngredient { const Value *V; VPlanIngredient(const Value *V) : V(V) {} + + void print(raw_ostream &O) const; }; inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) { - VPlanPrinter::printAsIngredient(OS, I.V); + I.print(OS); return OS; } inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { - VPlanPrinter Printer(OS, Plan); - Printer.dump(); + Plan.print(OS); return OS; } diff --git a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll index 6aa385d1df8d..181a7d70da82 100644 --- a/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,12 +36,13 @@ for.end: } ; Check for crash exposed by D76992. 
-; CHECK: N0 [label = -; CHECK-NEXT: "loop:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi 0, %iv.next\l" + -; CHECK-NEXT: "WIDEN ir<%cond0> = icmp ir<%iv>, ir<13>\l" + -; CHECK-NEXT: "WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20>\l" -; CHECK-NEXT: ] +; CHECK: VPlan { +; CHECK-NEXT: loop: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next +; CHECK-NEXT: WIDEN ir<%cond0> = icmp ir<%iv>, ir<13> +; CHECK-NEXT: WIDEN-SELECT ir<%s> = select ir<%cond0>, ir<10>, ir<20> +; CHECK-NEXT: No successor +; CHECK-NEXT: } define void @test() { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll new file mode 100644 index 000000000000..7d8d18dcfdaa --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -0,0 +1,40 @@ +; REQUIRES: asserts + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -vplan-print-in-dot-format -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; Verify that -vplan-print-in-dot-format option works. + +define void @print_call_and_memory(i64 %n, float* noalias %y, float* noalias %x) nounwind uwtable { +; CHECK: N0 [label = +; CHECK-NEXT: "for.body:\l" + +; CHECK-NEXT: " WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + +; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr ir\<%y\>, ir\<%iv\>\l" + +; CHECK-NEXT: " WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" + +; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>)\l" + +; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr ir\<%x\>, ir\<%iv\>\l" + +; CHECK-NEXT: " WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" + +; CHECK-NEXT: "No successors\l" +; CHECK-NEXT: ] +; +entry: + %cmp6 = icmp sgt i64 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, float* %y, i64 %iv + %lv = load float, float* %arrayidx, align 4 + %call = tail call float @llvm.sqrt.f32(float %lv) nounwind readnone + %arrayidx2 = getelementptr inbounds float, float* %x, i64 %iv + store float %call, float* %arrayidx2, align 4 + %iv.next = add i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +declare float @llvm.sqrt.f32(float) nounwind readnone diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 1f649f3dc206..93718ffbeab9 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -7,16 +7,17 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Tests for printing VPlans. 
define void @print_call_and_memory(i64 %n, float* noalias %y, float* noalias %x) nounwind uwtable { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + -; CHECK-NEXT: "CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + -; CHECK-NEXT: "WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>)\l" + -; CHECK-NEXT: "CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN store ir<%arrayidx2>, ir<%call>\l" -; CHECK-NEXT: ] - +; CHECK: VPlan { +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> +; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> +; CHECK-NEXT: WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>) +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv> +; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%call> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; entry: %cmp6 = icmp sgt i64 %n, 0 br i1 %cmp6, label %for.body, label %for.end @@ -37,18 +38,19 @@ for.end: ; preds = %for.body, %entry } define void @print_widen_gep_and_select(i64 %n, float* noalias %y, float* noalias %x, float* %z) nounwind uwtable { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + -; CHECK-NEXT: "WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + -; CHECK-NEXT: "WIDEN ir<%cmp> = icmp ir<%arrayidx>, ir<%z>\l" + -; CHECK-NEXT: "WIDEN-SELECT ir<%sel> = select ir<%cmp>, ir<1.000000e+01>, ir<2.000000e+01>\l" + -; CHECK-NEXT: "WIDEN ir<%add> = fadd ir<%lv>, ir<%sel>\l" + -; CHECK-NEXT: "CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN store ir<%arrayidx2>, ir<%add>\l" -; CHECK-NEXT: ] - +; CHECK: VPlan { +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 +; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> +; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%arrayidx>, ir<%z> +; CHECK-NEXT: WIDEN-SELECT ir<%sel> = select ir<%cmp>, ir<1.000000e+01>, ir<2.000000e+01> +; CHECK-NEXT: WIDEN ir<%add> = fadd ir<%lv>, ir<%sel> +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%x>, ir<%iv> +; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%add> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; entry: %cmp6 = icmp sgt i64 %n, 0 br i1 %cmp6, label %for.body, label %for.end @@ -71,15 +73,16 @@ for.end: ; preds = %for.body, %entry } define float @print_reduction(i64 %n, float* noalias %y) { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %iv = phi %iv.next, 0\l" + -; CHECK-NEXT: "WIDEN-PHI %red = phi %red.next, 0.000000e+00\l" + -; CHECK-NEXT: "CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv>\l" + -; CHECK-NEXT: "WIDEN ir<%lv> = load ir<%arrayidx>\l" + -; CHECK-NEXT: "REDUCE ir<%red.next> = ir<%red> + reduce.fadd (ir<%lv>)\l" -; CHECK-NEXT: ] - +; CHECK: VPlan { +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0 +; CHECK-NEXT: WIDEN-PHI %red = phi %red.next, 0.000000e+00 +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%y>, ir<%iv> +; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.fadd (ir<%lv>) +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; entry: br label %for.body @@ -98,36 +101,40 @@ for.end: ; preds = 
%for.body, %entry } define void @print_replicate_predicated_phi(i64 %n, i64* %x) { -; CHECK: N0 [label = -; CHECK-NEXT: "for.body:\n" + -; CHECK-NEXT: "WIDEN-INDUCTION %i = phi 0, %i.next\l" + -; CHECK-NEXT: "WIDEN ir<%cmp> = icmp ir<%i>, ir<5>\l" -; CHECK-NEXT: ] -; -; CHECK: N2 [label = -; CHECK-NEXT: "pred.udiv.entry:\n" + -; CHECK-NEXT: + -; CHECK-NEXT: "BRANCH-ON-MASK ir<%cmp>\l"\l -; CHECK-NEXT: "CondBit: ir<%cmp>" -; CHECK-NEXT: ] -; -; CHECK: N4 [label = -; CHECK-NEXT: "pred.udiv.if:\n" + -; CHECK-NEXT: "REPLICATE ir<%tmp4> = udiv ir<%n>, ir<%i> (S->V)\l" -; CHECK-NEXT: ] -; -; CHECK: N5 [label = -; CHECK-NEXT: "pred.udiv.continue:\n" + -; CHECK-NEXT: "PHI-PREDICATED-INSTRUCTION vp<%3> = ir<%tmp4>\l" -; CHECK-NEXT: ] -; -; CHECK: N7 [label = -; CHECK-NEXT: "for.inc:\n" + -; CHECK-NEXT: "EMIT vp<%4> = not ir<%cmp>\l" + -; CHECK-NEXT: "BLEND %d = ir<0>/vp<%4> vp<%3>/ir<%cmp>\l" + -; CHECK-NEXT: "CLONE ir<%idx> = getelementptr ir<%x>, ir<%i>\l" + -; CHECK-NEXT: "WIDEN store ir<%idx>, ir<%d>\l" -; CHECK-NEXT: ] +; CHECK: VPlan { +; CHECK-NEXT: for.body: +; CHECK-NEXT: WIDEN-INDUCTION %i = phi 0, %i.next +; CHECK-NEXT: WIDEN ir<%cmp> = icmp ir<%i>, ir<5> +; CHECK-NEXT: Successor(s): if.then +; CHECK-EMPTY: +; CHECK-NEXT: if.then: +; CHECK-NEXT: Successor(s): pred.udiv +; CHECK-EMPTY: +; CHECK-NEXT: pred.udiv: { +; CHECK-NEXT: pred.udiv.entry: +; CHECK-NEXT: BRANCH-ON-MASK ir<%cmp> +; CHECK-NEXT: Successor(s): pred.udiv.if, pred.udiv.continue +; CHECK-NEXT: CondBit: ir<%cmp> +; CHECK-EMPTY: +; CHECK-NEXT: pred.udiv.if: +; CHECK-NEXT: REPLICATE ir<%tmp4> = udiv ir<%n>, ir<%i> (S->V) +; CHECK-NEXT: Successor(s): pred.udiv.continue +; CHECK-EMPTY: +; CHECK-NEXT: pred.udiv.continue: +; CHECK-NEXT: PHI-PREDICATED-INSTRUCTION vp<%3> = ir<%tmp4> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-EMPTY: +; CHECK-NEXT: if.then.0: +; CHECK-NEXT: Successor(s): for.inc +; CHECK-EMPTY: +; CHECK-NEXT: for.inc: +; CHECK-NEXT: EMIT vp<%4> = not ir<%cmp> +; CHECK-NEXT: BLEND %d = ir<0>/vp<%4> vp<%3>/ir<%cmp> +; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, ir<%i> +; CHECK-NEXT: WIDEN store ir<%idx>, ir<%d> +; CHECK-NEXT: No successors +; CHECK-NEXT: } ; entry: br label %for.body diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 880b8f711462..cf314043f011 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -93,7 +93,8 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { // as this is not required with the new printing. 
Plan->addVPValue(&*F->arg_begin()); std::string FullDump; - raw_string_ostream(FullDump) << *Plan; + raw_string_ostream OS(FullDump); + Plan->printDOT(OS); const char *ExpectedStr = R"(digraph VPlan { graph [labelloc=t, fontsize=30; label="Vectorization Plan"] node [shape=rect, fontname=Courier, fontsize=30] @@ -103,25 +104,28 @@ compound=true fontname=Courier label="\ TopRegion" N1 [label = - "entry:\n" + "entry:\l" + + "Successor(s): for.body\l" ] N1 -> N2 [ label=""] N2 [label = - "for.body:\n" + - "WIDEN-PHI %indvars.iv = phi 0, %indvars.iv.next\l" + - "EMIT ir<%arr.idx> = getelementptr ir<%A> ir<%indvars.iv>\l" + - "EMIT ir<%l1> = load ir<%arr.idx>\l" + - "EMIT ir<%res> = add ir<%l1> ir<10>\l" + - "EMIT store ir<%res> ir<%arr.idx>\l" + - "EMIT ir<%indvars.iv.next> = add ir<%indvars.iv> ir<1>\l" + - "EMIT ir<%exitcond> = icmp ir<%indvars.iv.next> ir<%N>\l" + - "CondBit: ir<%exitcond> (for.body)\l" + "for.body:\l" + + " WIDEN-PHI %indvars.iv = phi 0, %indvars.iv.next\l" + + " EMIT ir\<%arr.idx\> = getelementptr ir\<%A\> ir\<%indvars.iv\>\l" + + " EMIT ir\<%l1\> = load ir\<%arr.idx\>\l" + + " EMIT ir\<%res\> = add ir\<%l1\> ir\<10\>\l" + + " EMIT store ir\<%res\> ir\<%arr.idx\>\l" + + " EMIT ir\<%indvars.iv.next\> = add ir\<%indvars.iv\> ir\<1\>\l" + + " EMIT ir\<%exitcond\> = icmp ir\<%indvars.iv.next\> ir\<%N\>\l" + + "Successor(s): for.body, for.end\l" + + "CondBit: ir\<%exitcond\> (for.body)\l" ] N2 -> N2 [ label="T"] N2 -> N3 [ label="F"] N3 [label = - "for.end:\n" + - "EMIT ret\l" + "for.end:\l" + + " EMIT ret\l" + + "No successors\l" ] } } diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index f8f1562d548c..71f27f95bad7 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -333,12 +333,14 @@ TEST(VPBasicBlockTest, print) { VPBB1->appendRecipe(I1); VPBB1->appendRecipe(I2); VPBB1->appendRecipe(I3); + VPBB1->setName("bb1"); VPInstruction *I4 = new VPInstruction(Instruction::Mul, {I2, I1}); VPInstruction *I5 = new VPInstruction(Instruction::Ret, {I4}); VPBasicBlock *VPBB2 = new VPBasicBlock(); VPBB2->appendRecipe(I4); VPBB2->appendRecipe(I5); + VPBB2->setName("bb2"); VPBlockUtils::connectBlocks(VPBB1, VPBB2); @@ -355,7 +357,8 @@ TEST(VPBasicBlockTest, print) { VPlan Plan; Plan.setEntry(VPBB1); std::string FullDump; - raw_string_ostream(FullDump) << Plan; + raw_string_ostream OS(FullDump); + Plan.printDOT(OS); const char *ExpectedStr = R"(digraph VPlan { graph [labelloc=t, fontsize=30; label="Vectorization Plan"] @@ -363,21 +366,45 @@ node [shape=rect, fontname=Courier, fontsize=30] edge [fontname=Courier, fontsize=30] compound=true N0 [label = - ":\n" + - "EMIT vp<%0> = add\l" + - "EMIT vp<%1> = sub vp<%0>\l" + - "EMIT br vp<%0> vp<%1>\l" + "bb1:\l" + + " EMIT vp\<%0\> = add\l" + + " EMIT vp\<%1\> = sub vp\<%0\>\l" + + " EMIT br vp\<%0\> vp\<%1\>\l" + + "Successor(s): bb2\l" ] N0 -> N1 [ label=""] N1 [label = - ":\n" + - "EMIT vp<%3> = mul vp<%1> vp<%0>\l" + - "EMIT ret vp<%3>\l" + "bb2:\l" + + " EMIT vp\<%3\> = mul vp\<%1\> vp\<%0\>\l" + + " EMIT ret vp\<%3\>\l" + + "No successors\l" ] } )"; EXPECT_EQ(ExpectedStr, FullDump); + const char *ExpectedBlock1Str = R"(bb1: + EMIT vp<%0> = add + EMIT vp<%1> = sub vp<%0> + EMIT br vp<%0> vp<%1> +Successor(s): bb2 +)"; + std::string Block1Dump; + raw_string_ostream OS1(Block1Dump); + VPBB1->print(OS1); + EXPECT_EQ(ExpectedBlock1Str, Block1Dump); + + // Ensure that numbering is good when dumping the second block 
in isolation. + const char *ExpectedBlock2Str = R"(bb2: + EMIT vp<%3> = mul vp<%1> vp<%0> + EMIT ret vp<%3> +No successors +)"; + std::string Block2Dump; + raw_string_ostream OS2(Block2Dump); + VPBB2->print(OS2); + EXPECT_EQ(ExpectedBlock2Str, Block2Dump); + { std::string I3Dump; raw_string_ostream OS(I3Dump); -- GitLab From 92205cb27fd80bcb605cc0a424c8d9e9dde374c5 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Fri, 19 Mar 2021 09:41:44 -0700 Subject: [PATCH 0172/1000] [NFC][VPlan] Guard print routines with "#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)" Reviewed By: mehdi_amini Differential Revision: https://reviews.llvm.org/D98897 --- .../Vectorize/LoopVectorizationPlanner.h | 2 + .../Transforms/Vectorize/LoopVectorize.cpp | 4 ++ llvm/lib/Transforms/Vectorize/VPlan.cpp | 19 +++++- llvm/lib/Transforms/Vectorize/VPlan.h | 58 ++++++++++++++++--- llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 2 + llvm/lib/Transforms/Vectorize/VPlanValue.h | 6 ++ .../Transforms/Vectorize/VPlanHCFGTest.cpp | 2 + .../Transforms/Vectorize/VPlanTest.cpp | 6 ++ 8 files changed, 89 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index fae75e318b42..70e1226e0ebf 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -256,7 +256,9 @@ public: /// best selected VPlan. void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printPlans(raw_ostream &O); +#endif /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 61b6fa1bcc63..ea04ea3c45ee 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7813,6 +7813,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV, ILV.printDebugTracesAtEnd(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LoopVectorizationPlanner::printPlans(raw_ostream &O) { for (const auto &Plan : VPlans) if (PrintVPlansInDotFormat) @@ -7820,6 +7821,7 @@ void LoopVectorizationPlanner::printPlans(raw_ostream &O) { else Plan->print(O); } +#endif void LoopVectorizationPlanner::collectTriviallyDeadInstructions( SmallPtrSetImpl &DeadInstructions) { @@ -9017,6 +9019,7 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions( } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at "; @@ -9032,6 +9035,7 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, if (Instruction *I = IG->getMember(i)) O << "\n" << Indent << " " << VPlanIngredient(I) << " " << i; } +#endif void VPWidenCallRecipe::execute(VPTransformState &State) { State.ILV->widenCallInstruction(*cast(getUnderlyingInstr()), this, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index d8df4a710d88..321ab377aa8b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -50,6 +50,7 @@ extern cl::opt EnableVPlanNativePath; #define DEBUG_TYPE "vplan" +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) raw_ostream &llvm::operator<<(raw_ostream 
&OS, const VPValue &V) {
  const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
  VPSlotTracker SlotTracker(
@@ -57,6 +58,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
   V.print(OS, SlotTracker);
   return OS;
 }
+#endif
 
 Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder,
                                 const ElementCount &VF) const {
@@ -83,6 +85,7 @@ VPValue::~VPValue() {
     Def->removeDefinedValue(this);
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
   if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
     R->print(OS, "", SlotTracker);
@@ -105,6 +108,7 @@ void VPDef::dump() const {
   print(dbgs(), "", SlotTracker);
   dbgs() << "\n";
 }
+#endif
 
 // Get the top-most entry block of \p Start. This is the entry block of the
 // containing VPlan. This function is templated to support both const and
 // non-const blocks
@@ -399,6 +403,7 @@ void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
   }
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
                          VPSlotTracker &SlotTracker) const {
   O << Indent << getName() << ":\n";
@@ -434,6 +439,7 @@ void VPBasicBlock::print(raw_ostream &O, const Twine &Indent,
     O << '\n';
   }
 }
+#endif
 
 void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
   for (VPBlockBase *Block : depth_first(Entry))
@@ -491,6 +497,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
   State->Instance.reset();
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
                           VPSlotTracker &SlotTracker) const {
   O << Indent << (isReplicator() ? "<xVFxUF> " : "<x1> ") << getName() << ": {";
@@ -501,6 +508,7 @@ void VPRegionBlock::print(raw_ostream &O, const Twine &Indent,
   }
   O << Indent << "}\n";
 }
+#endif
 
 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
   assert(!Parent && "Recipe already in some VPBasicBlock");
@@ -601,6 +609,7 @@ void VPInstruction::execute(VPTransformState &State) {
     generateInstruction(State, Part);
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPInstruction::dump() const {
   VPSlotTracker SlotTracker(getParent()->getPlan());
   print(dbgs(), "", SlotTracker);
@@ -641,6 +650,7 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent,
     Operand->printAsOperand(O, SlotTracker);
   }
 }
+#endif
 
 /// Generate the code inside the body of the vectorized loop. Assumes a single
 /// LoopVectorBody basic-block was created for this. Introduce additional
@@ -730,7 +740,7 @@ void VPlan::execute(VPTransformState *State) {
                        L->getExitBlock());
 }
 
-// TODO: Wrap those in #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)/#endif.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD
 void VPlan::print(raw_ostream &O) const {
   VPSlotTracker SlotTracker(this);
@@ -749,7 +759,6 @@ void VPlan::printDOT(raw_ostream &O) const {
   Printer.dump();
 }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD
 void VPlan::dump() const { print(dbgs()); }
 #endif
@@ -794,6 +803,7 @@ void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
   return (isa<VPRegionBlock>(Block) ?
"cluster_N" : "N") + Twine(getOrCreateBID(Block)); @@ -1072,6 +1082,7 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent, printOperands(O, SlotTracker); } +#endif void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; @@ -1098,12 +1109,14 @@ void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; getVPValue()->printAsOperand(O, SlotTracker); O << " = WIDEN-CANONICAL-INDUCTION"; } +#endif template void DomTreeBuilder::Calculate(VPDominatorTree &DT); @@ -1122,6 +1135,7 @@ void VPValue::replaceAllUsesWith(VPValue *New) { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const { if (const Value *UV = getUnderlyingValue()) { OS << "ir<"; @@ -1142,6 +1156,7 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { Op->printAsOperand(O, SlotTracker); }); } +#endif void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 5a98c63401b0..f27628572ce0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -573,10 +573,6 @@ public: /// Delete all blocks reachable from a given VPBlockBase, inclusive. static void deleteCFG(VPBlockBase *Entry); - void printAsOperand(raw_ostream &OS, bool PrintType) const { - OS << getName(); - } - /// Return true if it is legal to hoist instructions into this block. bool isLegalToHoistInto() { // There are currently no constraints that prevent an instruction to be @@ -588,6 +584,11 @@ public: /// replaces all uses of VPValues defined in the block with NewValue. virtual void dropAllReferences(VPValue *NewValue) = 0; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void printAsOperand(raw_ostream &OS, bool PrintType) const { + OS << getName(); + } + /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using /// consequtive numbers. @@ -604,7 +605,8 @@ public: } /// Dump this VPBlockBase to dbgs(). - void dump() const { print(dbgs()); } + LLVM_DUMP_METHOD void dump() const { print(dbgs()); } +#endif }; /// VPRecipeBase is a base class modeling a sequence of one or more output IR @@ -760,12 +762,14 @@ public: /// provided. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the VPInstruction to \p O. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; /// Print the VPInstruction to dbgs() (for debugging). - void dump() const; + LLVM_DUMP_METHOD void dump() const; +#endif /// Return true if this instruction may modify memory. bool mayWriteToMemory() const { @@ -819,9 +823,11 @@ public: /// Produce widened copies of all Ingredients. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for widening Call instructions. @@ -843,9 +849,11 @@ public: /// Produce a widened version of the call instruction. 
void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for widening select instructions. @@ -872,9 +880,11 @@ public: /// Produce a widened version of the select instruction. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for handling GEP instructions. @@ -910,9 +920,11 @@ public: /// Generate the gep nodes. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A recipe for handling phi nodes of integer and floating-point inductions, @@ -943,9 +955,11 @@ public: /// needed by their users. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif /// Returns the start value of the induction. VPValue *getStartValue() { return getOperand(0); } @@ -1005,9 +1019,11 @@ public: /// Generate the phi/select nodes. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif /// Returns the start value of the phi, if it is a reduction. VPValue *getStartValue() { @@ -1063,9 +1079,11 @@ public: /// Generate the phi/select nodes. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// VPInterleaveRecipe is a recipe for transforming an interleave group of load @@ -1126,9 +1144,11 @@ public: /// Generate the wide load or store, and shuffles. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif const InterleaveGroup *getInterleaveGroup() { return IG; } }; @@ -1166,9 +1186,11 @@ public: /// Generate the reduction in the loop void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif /// The VPValue of the scalar Chain being accumulated. VPValue *getChainOp() const { return getOperand(0); } @@ -1226,9 +1248,11 @@ public: void setAlsoPack(bool Pack) { AlsoPack = Pack; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif bool isUniform() const { return IsUniform; } @@ -1255,6 +1279,7 @@ public: /// conditional branch. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override { @@ -1264,6 +1289,7 @@ public: else O << " All-One"; } +#endif /// Return the mask used by this recipe. 
Note that a full mask is represented /// by a nullptr. @@ -1296,9 +1322,11 @@ public: /// Generates phi nodes for live-outs as needed to retain SSA form. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A Recipe for widening load/store operations. @@ -1363,9 +1391,11 @@ public: /// Generate the wide load/store. void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// A Recipe for widening the canonical induction variable of the vector loop. @@ -1387,9 +1417,11 @@ public: /// step = . void execute(VPTransformState &State) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the recipe. void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; +#endif }; /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It @@ -1474,6 +1506,7 @@ public: void dropAllReferences(VPValue *NewValue) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. \p /// SlotTracker is used to print unnamed VPValue's using consequtive numbers. /// @@ -1482,6 +1515,7 @@ public: void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; using VPBlockBase::print; // Get the print(raw_stream &O) version. +#endif private: /// Create an IR BasicBlock to hold the output instructions generated by this @@ -1575,6 +1609,7 @@ public: void dropAllReferences(VPValue *NewValue) override; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using /// consequtive numbers. @@ -1584,6 +1619,7 @@ public: void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override; using VPBlockBase::print; // Get the print(raw_stream &O) version. +#endif }; //===----------------------------------------------------------------------===// @@ -1836,6 +1872,7 @@ public: VPLoopInfo &getVPLoopInfo() { return VPLInfo; } const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPlan to \p O. void print(raw_ostream &O) const; @@ -1843,7 +1880,8 @@ public: void printDOT(raw_ostream &O) const; /// Dump the plan to stderr (for debugging). - void dump() const; + LLVM_DUMP_METHOD void dump() const; +#endif /// Returns a range mapping the values the range \p Operands to their /// corresponding VPValues. @@ -1863,6 +1901,7 @@ private: BasicBlock *LoopExitBB); }; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// VPlanPrinter prints a given VPlan to a given output stream. The printing is /// indented and follows the dot format. 
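/// A usage sketch, assuming a fully constructed VPlan `Plan` (this mirrors
/// what VPlan::printDOT does internally):
///   VPlanPrinter Printer(dbgs(), Plan);
///   Printer.dump();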
class VPlanPrinter { @@ -1909,7 +1948,7 @@ public: VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P), SlotTracker(&P) {} - void dump(); + LLVM_DUMP_METHOD void dump(); }; struct VPlanIngredient { @@ -1929,6 +1968,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { Plan.print(OS); return OS; } +#endif //===----------------------------------------------------------------------===// // VPlan Utilities @@ -2144,8 +2184,10 @@ class VPlanSlp { SmallPtrSetImpl &Candidates, VPInterleavedAccessInfo &IAI); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print bundle \p Values to dbgs(). void dumpBundle(ArrayRef Values); +#endif public: VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 39c879d45647..fd02805d971f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -349,6 +349,7 @@ SmallVector VPlanSlp::reorderMultiNodeOps() { return FinalOrder; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPlanSlp::dumpBundle(ArrayRef Values) { dbgs() << " Ops: "; for (auto Op : Values) { @@ -361,6 +362,7 @@ void VPlanSlp::dumpBundle(ArrayRef Values) { } dbgs() << "\n"; } +#endif VPInstruction *VPlanSlp::buildGraph(ArrayRef Values) { assert(!Values.empty() && "Need some operands!"); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 55c2c748a5b9..81bd221432d4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -116,11 +116,13 @@ public: /// for any other purpose, as the values may change as LLVM evolves. unsigned getVPValueID() const { return SubclassID; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const; void print(raw_ostream &OS, VPSlotTracker &Tracker) const; /// Dump the value to stderr (for debugging). void dump() const; +#endif unsigned getNumUsers() const { return Users.size(); } void addUser(VPUser &User) { Users.push_back(&User); } @@ -192,8 +194,10 @@ class VPUser { SmallVector Operands; protected: +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print the operands to \p O. void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const; +#endif public: VPUser() {} @@ -347,12 +351,14 @@ public: /// for any other purpose, as the values may change as LLVM evolves. unsigned getVPDefID() const { return SubclassID; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Dump the VPDef to stderr (for debugging). void dump() const; /// Each concrete VPDef prints itself. virtual void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const = 0; +#endif }; class VPlan; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index cf314043f011..853be5757731 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -89,6 +89,7 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { EXPECT_EQ(IndvarAdd, ICmp->getOperand(0)); EXPECT_EQ(VecBB->getCondBit(), ICmp); +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) // Add an external value to check we do not print the list of external values, // as this is not required with the new printing. 
Plan->addVPValue(&*F->arg_begin()); @@ -131,6 +132,7 @@ compound=true } )"; EXPECT_EQ(ExpectedStr, FullDump); +#endif LoopVectorizationLegality::InductionList Inductions; SmallPtrSet DeadInstructions; diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 71f27f95bad7..2836e8199678 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -324,6 +324,7 @@ TEST(VPBasicBlockTest, getPlan) { } } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) TEST(VPBasicBlockTest, print) { VPInstruction *I1 = new VPInstruction(Instruction::Add, {}); VPInstruction *I2 = new VPInstruction(Instruction::Sub, {I1}); @@ -422,6 +423,7 @@ No successors EXPECT_EQ("EMIT vp<%3> = mul vp<%1> vp<%0>", I4Dump); } } +#endif TEST(VPRecipeTest, CastVPInstructionToVPUser) { VPValue Op1; @@ -608,6 +610,7 @@ TEST(VPRecipeTest, CastVPWidenMemoryInstructionRecipeToVPUserAndVPDef) { delete Load; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) TEST(VPRecipeTest, dump) { VPlan Plan; VPBasicBlock *VPBB1 = new VPBasicBlock(); @@ -663,6 +666,7 @@ TEST(VPRecipeTest, dump) { delete AI; } +#endif TEST(VPRecipeTest, CastVPReductionRecipeToVPUser) { LLVMContext C; @@ -684,8 +688,10 @@ struct VPDoubleValueDef : public VPRecipeBase { } void execute(struct VPTransformState &State) override{}; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const override {} +#endif }; TEST(VPDoubleValueDefTest, traverseUseLists) { -- GitLab From fbc1f48daf1b8945516c0f8d16af24fc3c5d6f62 Mon Sep 17 00:00:00 2001 From: Jianzhou Zhao Date: Fri, 19 Mar 2021 17:53:13 +0000 Subject: [PATCH 0173/1000] [dfsan] Turn on testing origin tracking at atomics.ll --- llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll index c917774b4506..0075170410fe 100644 --- a/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll +++ b/llvm/test/Instrumentation/DataFlowSanitizer/atomics.ll @@ -1,7 +1,7 @@ ; RUN: opt < %s -dfsan -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16 ; RUN: opt < %s -dfsan -dfsan-fast-8-labels=true -S | FileCheck %s --check-prefixes=CHECK -; -; The patterns about origins cannot be tested until the origin tracking feature is complete. +; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -S | FileCheck %s --check-prefixes=CHECK,CHECK16,CHECK_ORIGIN +; RUN: opt < %s -dfsan -dfsan-track-origins=1 -dfsan-fast-16-labels=true -dfsan-instrument-with-call-threshold=0 -S | FileCheck %s --check-prefixes=CHECK,CHECK16,CHECK_ORIGIN target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -- GitLab From 95998b898c68206bf0693cc5c1fd17ab9a395cef Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 19 Mar 2021 10:47:32 -0700 Subject: [PATCH 0174/1000] [Hexagon] Return an i64 for result 0 from LowerREADCYCLECOUNTER instead of an i32. As far as I can tell, the node coming in has an i64 result so the return should have the same type. The HexagonISD node used for this has a type profile that says the result is i64. 
Found while trying to add asserts to LegalizeDAG to catch result type
mismatches.

Reviewed By: kparzysz

Differential Revision: https://reviews.llvm.org/D98962
---
 llvm/lib/Target/Hexagon/HexagonISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index a7e9ed34bfcb..153c7e9d9489 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -703,7 +703,7 @@ SDValue
 HexagonTargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
   SDLoc dl(Op);
-  SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
+  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
   return DAG.getNode(HexagonISD::READCYCLE, dl, VTs, Chain);
 }
 
-- 
GitLab


From 5d315691c42b57d1858d0f8dc486708bf839cdb3 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 19 Mar 2021 10:47:47 -0700
Subject: [PATCH 0175/1000] [RISCV] Add missing bitcasts to the results of
 lowerINSERT_SUBVECTOR and lowerEXTRACT_SUBVECTOR when handling mask vectors.

Found by adding asserts to LegalizeDAG to catch incorrect result
types being returned.

Reviewed By: frasercrmck

Differential Revision: https://reviews.llvm.org/D98964
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 6dfc2d46afe1..3bde5158c9b1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2876,9 +2876,9 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
     SDValue SlideupAmt = DAG.getConstant(OrigIdx, DL, XLenVT);
     SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, ContainerVT, Vec,
                                   SubVec, SlideupAmt, Mask, VL);
-    if (!VecVT.isFixedLengthVector())
-      return Slideup;
-    return convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
+    if (VecVT.isFixedLengthVector())
+      Slideup = convertFromScalableVector(VecVT, Slideup, DAG, Subtarget);
+    return DAG.getBitcast(Op.getValueType(), Slideup);
   }
 
   unsigned SubRegIdx, RemIdx;
@@ -3025,8 +3025,9 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
         DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, ContainerVT,
                     DAG.getUNDEF(ContainerVT), Vec, SlidedownAmt, Mask, VL);
     // Now we can use a cast-like subvector extract to get the result.
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown,
-                       DAG.getConstant(0, DL, XLenVT));
+    Slidedown = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, Slidedown,
+                            DAG.getConstant(0, DL, XLenVT));
+    return DAG.getBitcast(Op.getValueType(), Slidedown);
   }
 
   unsigned SubRegIdx, RemIdx;
-- 
GitLab


From 5737010a7948441c78c2a367afa7c86efc8ae268 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson
Date: Tue, 16 Mar 2021 12:47:16 +0100
Subject: [PATCH 0176/1000] [LangRef] Describe memory layout for vector types

There are a couple of caveats when it comes to how vectors are stored
to memory, and thereby also how bitcast between vector and integer
types work, in LLVM IR. Especially in relation to endianness.

This patch is an attempt to document such things.
Reviewed By: nlopes

Differential Revision: https://reviews.llvm.org/D94964
---
 llvm/docs/LangRef.rst | 68 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 65 insertions(+), 3 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 54fb8945324b..142556c55777 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -3200,6 +3200,63 @@ and a scalable property to represent vectors where the
 exact hardware vector length is unknown at compile time. Vector types are
 considered :ref:`first class <t_firstclass>`.
 
+:Memory Layout:
+
+In general vector elements are laid out in memory in the same way as
+:ref:`array types <t_array>`. Such an analogy works fine as long as the vector
+elements are byte sized. However, when the elements of the vector aren't byte
+sized it gets a bit more complicated. One way to describe the layout is by
+describing what happens when a vector such as <N x iM> is bitcasted to an
+integer type with N*M bits, and then following the rules for storing such an
+integer to memory.
+
+A bitcast from a vector type to a scalar integer type will see the elements
+being packed together (without padding). The order in which elements are
+inserted in the integer depends on endianness. For little endian element zero
+is put in the least significant bits of the integer, and for big endian
+element zero is put in the most significant bits.
+
+Using a vector such as ``<i4 1, i4 2, i4 3, i4 5>`` as an example, together
+with the analogy that we can replace a vector store by a bitcast followed by
+an integer store, we get this for big endian:
+
+.. code-block:: llvm
+
+    %val = bitcast <4 x i4> <i4 1, i4 2, i4 3, i4 5> to i16
+
+    ; Bitcasting from a vector to an integral type can be seen as
+    ; concatenating the values:
+    ;   %val now has the hexadecimal value 0x1235.
+
+    store i16 %val, i16* %ptr
+
+    ; In memory the content will be (8-bit addressing):
+    ;
+    ;    [%ptr + 0]: 00010010  (0x12)
+    ;    [%ptr + 1]: 00110101  (0x35)
+
+The same example for little endian:
+
+.. code-block:: llvm
+
+    %val = bitcast <4 x i4> <i4 1, i4 2, i4 3, i4 5> to i16
+
+    ; Bitcasting from a vector to an integral type can be seen as
+    ; concatenating the values:
+    ;   %val now has the hexadecimal value 0x5321.
+
+    store i16 %val, i16* %ptr
+
+    ; In memory the content will be (8-bit addressing):
+    ;
+    ;    [%ptr + 0]: 00100001  (0x21)
+    ;    [%ptr + 1]: 01010011  (0x53)
+
+When ``N*M`` isn't evenly divisible by the byte size the exact memory layout
+is unspecified (just like it is for an integral type of the same size). This
+is because different targets could put the padding at different positions when
+the type size is smaller than the type's store size.
+
 :Syntax:
 
 ::
@@ -10604,14 +10661,19 @@ pointers) types with the same address space through this instruction. To
 convert pointers to other types, use the :ref:`inttoptr <i_inttoptr>` or
 :ref:`ptrtoint <i_ptrtoint>` instructions first.
 
+There is a caveat for bitcasts involving vector types in relation to
+endianness. For example ``bitcast <2 x i8> to i16`` puts element zero
+of the vector in the least significant bits of the i16 for little-endian while
+element zero ends up in the most significant bits for big-endian.
+
 Example:
 """"""""
 
 .. code-block:: text
 
-      %X = bitcast i8 255 to i8          ; yields i8 :-1
-      %Y = bitcast i32* %x to sint*      ; yields sint*:%x
-      %Z = bitcast <2 x int> %V to i64;  ; yields i64: %V
+      %X = bitcast i8 255 to i8              ; yields i8 :-1
+      %Y = bitcast i32* %x to sint*          ; yields sint*:%x
+      %Z = bitcast <2 x int> %V to i64;      ; yields i64: %V (depends on endianness)
       %Z = bitcast <2 x i32*> %V to <2 x i64*> ; yields <2 x i64*>
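One more worked case following the layout rules above (the concrete values
are illustrative only):

.. code-block:: llvm

    %a = bitcast <2 x i8> <i8 1, i8 5> to i16

    ; little endian: %a is 0x0501 (element zero in the low byte)
    ; big endian:    %a is 0x0105 (element zero in the high byte)
    ;
    ; Either way a store of %a puts element zero (0x01) at the lowest
    ; address, matching the array-style layout for byte-sized elements.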
.. _i_addrspacecast:

-- 
GitLab


From 14ae0cf0f5cde5a5e64b955dfda5b5af3e882cdb Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 2 Mar 2021 10:53:49 -0800
Subject: [PATCH 0177/1000] [Cost] Canonicalize the cost for logical or/and
 reductions.

The generic cost of logical or/and reductions should be the cost of
bitcast <ReduxWidth x i1> to iReduxWidth + cmp eq|ne iReduxWidth.

Differential Revision: https://reviews.llvm.org/D97961
---
 llvm/include/llvm/CodeGen/BasicTTIImpl.h      | 16 +++++++++
 .../SystemZ/SystemZTargetTransformInfo.cpp    |  7 +++-
 .../Analysis/CostModel/AArch64/reduce-and.ll  | 12 +++----
 .../Analysis/CostModel/AArch64/reduce-or.ll   | 12 +++----
 .../Analysis/CostModel/AMDGPU/reduce-and.ll   | 14 ++++----
 .../Analysis/CostModel/AMDGPU/reduce-or.ll    | 14 ++++----
 .../test/Analysis/CostModel/ARM/reduce-and.ll | 14 ++++----
 llvm/test/Analysis/CostModel/ARM/reduce-or.ll | 14 ++++----
 .../Analysis/CostModel/PowerPC/reduce-and.ll  | 14 ++++----
 .../Analysis/CostModel/PowerPC/reduce-or.ll   | 14 ++++----
 .../Analysis/CostModel/RISCV/reduce-and.ll    | 35 ++++++++++++-------
 .../Analysis/CostModel/RISCV/reduce-or.ll     | 35 ++++++++++++-------
 .../Analysis/CostModel/SystemZ/reduce-and.ll  | 14 ++++----
 .../Analysis/CostModel/SystemZ/reduce-or.ll   | 14 ++++----
 14 files changed, 136 insertions(+), 93 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 39d3812a68d5..9b043fe98b2d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -1895,6 +1895,22 @@ public:
                                       TTI::TargetCostKind CostKind) {
     Type *ScalarTy = Ty->getElementType();
     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
+    if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
+        ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
+        NumVecElts >= 2) {
+      // Or reduction for i1 is represented as:
+      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
+      // %res = cmp ne iReduxWidth %val, 0
+      // And reduction for i1 is represented as:
+      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
+      // %res = cmp eq iReduxWidth %val, 11111 (i.e. all ones)
+      Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
+      return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
+                                       TTI::CastContextHint::None, CostKind) +
+             thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
+                                         CmpInst::makeCmpResultType(ValTy),
+                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
+    }
     unsigned NumReduxLevels = Log2_32(NumVecElts);
     unsigned ArithCost = 0;
     unsigned ShuffleCost = 0;
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index 2c3576139ebf..6ef5277bc5d4 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -750,8 +750,13 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
     }
   } else if (ST->hasVector()) {
+    // Vector to scalar cast.
     auto *SrcVecTy = cast<FixedVectorType>(Src);
-    auto *DstVecTy = cast<FixedVectorType>(Dst);
+    auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
+    if (!DstVecTy) {
+      // TODO: tune vector-to-scalar cast.
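+      // Note that Dst can legitimately be a scalar integer here: for example,
+      // the generic i1 or/and reduction costing above now queries the cost of
+      // a <NumVecElts x i1> -> iNumVecElts bitcast.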
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I); + } unsigned VF = SrcVecTy->getNumElements(); unsigned NumDstVectors = getNumVectorRegs(Dst); unsigned NumSrcVectors = getNumVectorRegs(Src); diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll index 2df09c5f7bef..cbf04bfa8238 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-and.ll @@ -5,12 +5,12 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 364 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 455 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 637 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1001 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 362 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll index 3888495a3fc8..dba196f0f042 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-or.ll @@ -5,12 +5,12 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 129 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 364 for instruction: %V16 = 
call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 455 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 637 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1001 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 91 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 362 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll b/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll index 7609deb86b84..07592b1f8d4c 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/reduce-and.ll @@ -4,13 +4,13 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 65 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll b/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll index 362efbb5615d..c78c115fe6b8 100644 --- a/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/AMDGPU/reduce-or.ll @@ -4,13 +4,13 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-and.ll b/llvm/test/Analysis/CostModel/ARM/reduce-and.ll index 26120b8657d1..2bd23dd27719 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-and.ll @@ -4,13 +4,13 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for 
instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 385 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-or.ll b/llvm/test/Analysis/CostModel/ARM/reduce-or.ll index 2027e6f16d58..c29f0cfd609d 100644 --- a/llvm/test/Analysis/CostModel/ARM/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/ARM/reduce-or.ll @@ -4,13 +4,13 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 391 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 488 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 682 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1070 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 385 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll b/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll index 15f697fd1007..68768258bc2f 100644 --- a/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/PowerPC/reduce-and.ll @@ -4,13 +4,13 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 386 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll b/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll index b37396fd1e6b..1ccae8185b3d 100644 --- a/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll +++ 
b/llvm/test/Analysis/CostModel/PowerPC/reduce-or.ll @@ -4,13 +4,13 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 97 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 193 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 386 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-and.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-and.ll index d5aae4153b25..1405464af6a5 100644 --- a/llvm/test/Analysis/CostModel/RISCV/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-and.ll @@ -1,18 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=riscv32 -cost-model -cost-kind=throughput -analyze | FileCheck %s -; RUN: opt < %s -mtriple=riscv64 -cost-model -cost-kind=throughput -analyze | FileCheck %s +; RUN: opt < %s -mtriple=riscv32 -cost-model -cost-kind=throughput -analyze | FileCheck %s --check-prefix=RISCV32 +; RUN: opt < %s -mtriple=riscv64 -cost-model -cost-kind=throughput -analyze | FileCheck %s --check-prefix=RISCV64 define i32 @reduce_i1(i32 %arg) { -; CHECK-LABEL: 'reduce_i1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an 
estimated cost of 22 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; RISCV32-LABEL: 'reduce_i1' +; RISCV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; RISCV64-LABEL: 'reduce_i1' +; RISCV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-or.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-or.ll index 3f14f265c190..9d675de362c7 100644 --- a/llvm/test/Analysis/CostModel/RISCV/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-or.ll @@ -1,18 +1,29 @@ 
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -mtriple=riscv32 -cost-model -cost-kind=throughput -analyze | FileCheck %s -; RUN: opt < %s -mtriple=riscv64 -cost-model -cost-kind=throughput -analyze | FileCheck %s +; RUN: opt < %s -mtriple=riscv32 -cost-model -cost-kind=throughput -analyze | FileCheck %s --check-prefix=RISCV32 +; RUN: opt < %s -mtriple=riscv64 -cost-model -cost-kind=throughput -analyze | FileCheck %s --check-prefix=RISCV64 define i32 @reduce_i1(i32 %arg) { -; CHECK-LABEL: 'reduce_i1' -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 190 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; RISCV32-LABEL: 'reduce_i1' +; RISCV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; RISCV32-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef +; +; RISCV64-LABEL: 'reduce_i1' +; RISCV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 17 for 
instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef) +; RISCV64-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef) %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-and.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-and.ll index fccd496058ec..4eab88794f83 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/SystemZ/reduce-and.ll @@ -4,13 +4,13 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 'reduce_i1' ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i1 @llvm.vector.reduce.and.v32i1(<32 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64 = call i1 @llvm.vector.reduce.and.v64i1(<64 x i1> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i1 @llvm.vector.reduce.and.v128i1(<128 x i1> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %V1 = call i1 @llvm.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-or.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-or.ll index 18d0f2b838d8..700e4d6a8f8a 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/SystemZ/reduce-or.ll @@ -4,13 +4,13 @@ define i32 @reduce_i1(i32 %arg) { ; CHECK-LABEL: 
'reduce_i1'
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16 = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V64 = call i1 @llvm.vector.reduce.or.v64i1(<64 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 258 for instruction: %V128 = call i1 @llvm.vector.reduce.or.v128i1(<128 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 %V1 = call i1 @llvm.vector.reduce.or.v1i1(<1 x i1> undef)
--
GitLab

From 6c52d4fd4c24a0cf738e44516ca8378d65dcf019 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Fri, 19 Mar 2021 11:11:59 -0700
Subject: [PATCH 0178/1000] [lldb] Make the API, Shell and Unit tests independent lit test suites

Make the API, Shell and Unit tests independent lit test suites. This
allows us to specify different dependencies and skip rebuilding all the
unit tests (which is particularly expensive) when running check-lldb-api
or check-lldb-shell.

This does not change the autogenerated targets such as
check-lldb-shell-driver or the top level check-lldb target, which all
continue to work as before.
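For a concrete picture of the effect (illustrative only, assuming a Ninja
build tree and the target names introduced in the diffs below):

  $ ninja check-lldb-shell   # builds only lldb-shell-test-deps; LLDBUnitTests is not rebuilt
  $ ninja check-lldb-api     # builds only lldb-api-test-deps
  $ ninja check-lldb         # top-level target, still depends on all three suites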
Differential revision: https://reviews.llvm.org/D98842 --- lldb/test/API/CMakeLists.txt | 27 +++++++++++++++------------ lldb/test/API/lit.cfg.py | 5 +++-- lldb/test/API/lit.site.cfg.py.in | 1 - lldb/test/CMakeLists.txt | 18 ++++++------------ lldb/test/Shell/CMakeLists.txt | 19 +++++++++++-------- lldb/test/Unit/CMakeLists.txt | 12 +++++++++++- lldb/unittests/CMakeLists.txt | 3 ++- 7 files changed, 48 insertions(+), 37 deletions(-) diff --git a/lldb/test/API/CMakeLists.txt b/lldb/test/API/CMakeLists.txt index 0dbc46defc81..2b7dba456b1a 100644 --- a/lldb/test/API/CMakeLists.txt +++ b/lldb/test/API/CMakeLists.txt @@ -1,3 +1,10 @@ +add_custom_target(lldb-api-test-deps) +add_dependencies(lldb-api-test-deps lldb-test-deps) + +add_lit_testsuites(LLDB-API + ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS lldb-api-test-deps) + function(add_python_test_target name test_script args comment) set(PYTHON_TEST_COMMAND ${Python3_EXECUTABLE} @@ -153,39 +160,35 @@ string(REPLACE ${CMAKE_CFG_INTDIR} ${dotest_args_replacement} LLDB_TEST_EXECUTAB string(REPLACE ${CMAKE_CFG_INTDIR} ${dotest_args_replacement} LLDB_TEST_COMPILER "${LLDB_TEST_COMPILER}") string(REPLACE ${CMAKE_CFG_INTDIR} ${dotest_args_replacement} LLDB_TEST_DSYMUTIL "${LLDB_TEST_DSYMUTIL}") -# Configure the API test suite. configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py MAIN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py) -if (CMAKE_GENERATOR STREQUAL "Xcode") - # Xcode does not get the auto-generated targets. We need to create - # check-lldb-api manually. - add_lit_testsuite(check-lldb-api "Running lldb api test suite" - ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS lldb-test-deps) -endif() - # Targets for running the test suite on the different Apple simulators. add_lit_testsuite(check-lldb-simulator-ios "Running lldb test suite on the iOS simulator" ${CMAKE_CURRENT_BINARY_DIR} PARAMS "lldb-run-with-simulator=ios" EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-test-deps) + DEPENDS lldb-api-test-deps) add_lit_testsuite(check-lldb-simulator-watchos "Running lldb test suite on the watchOS simulator" ${CMAKE_CURRENT_BINARY_DIR} PARAMS "lldb-run-with-simulator=watchos" EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-test-deps) + DEPENDS lldb-api-test-deps) add_lit_testsuite(check-lldb-simulator-tvos "Running lldb test suite on the tvOS simulator" ${CMAKE_CURRENT_BINARY_DIR} PARAMS "lldb-run-with-simulator=tvos" EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-test-deps) + DEPENDS lldb-api-test-deps) + +add_lit_testsuite(check-lldb-api "Running lldb api test suite" + ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS lldb-api-test-deps) diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 54a02453b174..1bd7dc35fb2a 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -17,9 +17,10 @@ config.name = 'lldb-api' config.suffixes = ['.py'] # test_source_root: The root path where tests are located. -# test_exec_root: The root path where tests should be run. config.test_source_root = os.path.dirname(__file__) -config.test_exec_root = config.test_source_root + +# test_exec_root: The root path where tests should be run. 
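+# (This previously pointed at the source tree; running in the build tree
+# presumably keeps test artifacts out of the checkout.)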
+config.test_exec_root = os.path.join(config.lldb_obj_root, 'test') def mkdir_p(path): diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 2e368325a9f0..49ea94aacd11 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -1,6 +1,5 @@ @LIT_SITE_CFG_IN_HEADER@ -config.test_exec_root = "@LLDB_BINARY_DIR@" config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index 8363bde23035..c6b01c66a0ef 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -185,19 +185,13 @@ configure_lit_site_cfg( MAIN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py) -add_lit_testsuites(LLDB - ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS lldb-test-deps) - -add_lit_testsuite(check-lldb-lit "Running lldb lit test suite" +add_lit_testsuite(check-lldb "Running lldb lit test suite" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS lldb-test-deps) -set_target_properties(check-lldb-lit PROPERTIES FOLDER "lldb tests") - -add_custom_target(check-lldb) -add_dependencies(check-lldb lldb-test-deps) -set_target_properties(check-lldb PROPERTIES FOLDER "lldb misc") -add_dependencies(check-lldb check-lldb-lit) + DEPENDS + lldb-api-test-deps + lldb-shell-test-deps + lldb-unit-test-deps) +set_target_properties(check-lldb PROPERTIES FOLDER "lldb tests") # Add a lit test suite that runs the API & shell test while capturing a # reproducer. diff --git a/lldb/test/Shell/CMakeLists.txt b/lldb/test/Shell/CMakeLists.txt index d203f1e093c7..f0d7b9a34651 100644 --- a/lldb/test/Shell/CMakeLists.txt +++ b/lldb/test/Shell/CMakeLists.txt @@ -1,4 +1,10 @@ -# Configure the Shell test suite. +add_custom_target(lldb-shell-test-deps) +add_dependencies(lldb-shell-test-deps lldb-test-deps) + +add_lit_testsuites(LLDB-SHELL + ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS lldb-shell-test-deps) + configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py @@ -8,10 +14,7 @@ configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/lit-lldb-init.in ${CMAKE_CURRENT_BINARY_DIR}/lit-lldb-init) -if (CMAKE_GENERATOR STREQUAL "Xcode") - # Xcode does not get the auto-generated targets. We need to create - # check-lldb-shell manually. - add_lit_testsuite(check-lldb-shell "Running lldb shell test suite" - ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS lldb-test-deps) -endif() +add_lit_testsuite(check-lldb-shell "Running lldb shell test suite" + ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS lldb-shell-test-deps) diff --git a/lldb/test/Unit/CMakeLists.txt b/lldb/test/Unit/CMakeLists.txt index e9b3d9e35d74..3233c0873c1f 100644 --- a/lldb/test/Unit/CMakeLists.txt +++ b/lldb/test/Unit/CMakeLists.txt @@ -1,7 +1,17 @@ -# Configure the Unit test suite. 
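+# Give the unit tests their own dependency target, mirroring the pattern
+# used for the API and Shell suites.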
+add_custom_target(lldb-unit-test-deps) +add_dependencies(lldb-unit-test-deps lldb-test-deps) + +add_lit_testsuites(LLDB-UNIT + ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS lldb-unit-test-deps) + configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py MAIN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py) +add_lit_testsuite(check-lldb-unit "Running lldb unit test suite" + ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS lldb-unit-test-deps) diff --git a/lldb/unittests/CMakeLists.txt b/lldb/unittests/CMakeLists.txt index 37a5f972cdec..e7b0f1c17d6d 100644 --- a/lldb/unittests/CMakeLists.txt +++ b/lldb/unittests/CMakeLists.txt @@ -1,6 +1,7 @@ add_custom_target(LLDBUnitTests) set_target_properties(LLDBUnitTests PROPERTIES FOLDER "lldb tests") -add_dependencies(lldb-test-deps LLDBUnitTests) + +add_dependencies(lldb-unit-test-deps LLDBUnitTests) include_directories(${LLDB_SOURCE_ROOT}) include_directories(${LLDB_PROJECT_ROOT}/unittests) -- GitLab From 66f340051ac2d334f30ef85251323b12cb2e6e5f Mon Sep 17 00:00:00 2001 From: Vy Nguyen Date: Thu, 18 Mar 2021 18:49:45 -0400 Subject: [PATCH 0179/1000] [lld-macho] Define __mh_*_header synthetic symbols. Bug: https://bugs.llvm.org/show_bug.cgi?id=49290 Differential Revision: https://reviews.llvm.org/D97007 --- lld/MachO/Driver.cpp | 9 +---- lld/MachO/SymbolTable.cpp | 8 ++-- lld/MachO/SymbolTable.h | 4 +- lld/MachO/Symbols.h | 6 +-- lld/MachO/SyntheticSections.cpp | 62 +++++++++++++++++++++++++++++- lld/MachO/SyntheticSections.h | 2 + lld/MachO/Writer.cpp | 1 - lld/test/MachO/export-trie.s | 13 +++++-- lld/test/MachO/map-file.s | 1 + lld/test/MachO/mh-execute-header.s | 16 ++++++++ lld/test/MachO/mh-header-link.s | 43 +++++++++++++++++++++ lld/test/MachO/objc.s | 2 +- lld/test/MachO/stabs.s | 1 + lld/test/MachO/symtab.s | 14 ++++++- 14 files changed, 156 insertions(+), 26 deletions(-) create mode 100644 lld/test/MachO/mh-execute-header.s create mode 100644 lld/test/MachO/mh-header-link.s diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index c85b72564213..341ddaf870a6 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -1043,14 +1043,7 @@ bool macho::link(ArrayRef argsArr, bool canExitEarly, } createSyntheticSections(); - - // The Itanium C++ ABI requires dylibs to pass a pointer to __cxa_atexit - // which does e.g. cleanup of static global variables. The ABI document says - // that the pointer can point to any address in one of the dylib's segments, - // but in practice ld64 seems to set it to point to the header, so that's - // what's implemented here. 
- symtab->addSynthetic("___dso_handle", in.header->isec, 0, - /*privateExtern=*/true, /*linkerInternal=*/true); + createSyntheticSymbols(); for (const Arg *arg : args.filtered(OPT_sectcreate)) { StringRef segName = arg->getValue(0); diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp index 1d812538005d..311c0018379d 100644 --- a/lld/MachO/SymbolTable.cpp +++ b/lld/MachO/SymbolTable.cpp @@ -159,10 +159,10 @@ Symbol *SymbolTable::addLazy(StringRef name, ArchiveFile *file, Defined *SymbolTable::addSynthetic(StringRef name, InputSection *isec, uint32_t value, bool isPrivateExtern, - bool isLinkerInternal) { - Defined *s = addDefined(name, nullptr, isec, value, /*isWeakDef=*/false, - isPrivateExtern); - s->linkerInternal = isLinkerInternal; + bool includeInSymtab) { + Defined *s = addDefined(name, nullptr, isec, value, + /*isWeakDef=*/false, isPrivateExtern); + s->includeInSymtab = includeInSymtab; return s; } diff --git a/lld/MachO/SymbolTable.h b/lld/MachO/SymbolTable.h index 8964713c7a74..9aed8c90064f 100644 --- a/lld/MachO/SymbolTable.h +++ b/lld/MachO/SymbolTable.h @@ -9,6 +9,8 @@ #ifndef LLD_MACHO_SYMBOL_TABLE_H #define LLD_MACHO_SYMBOL_TABLE_H +#include "Symbols.h" + #include "lld/Common/LLVM.h" #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" @@ -50,7 +52,7 @@ public: const llvm::object::Archive::Symbol &sym); Defined *addSynthetic(StringRef name, InputSection *, uint32_t value, - bool isPrivateExtern, bool isLinkerInternal); + bool isPrivateExtern, bool includeInSymtab); ArrayRef getSymbols() const { return symVector; } Symbol *find(llvm::CachedHashStringRef name); diff --git a/lld/MachO/Symbols.h b/lld/MachO/Symbols.h index ada5bc164c82..e815b7de9c20 100644 --- a/lld/MachO/Symbols.h +++ b/lld/MachO/Symbols.h @@ -99,7 +99,7 @@ public: bool isWeakDef, bool isExternal, bool isPrivateExtern) : Symbol(DefinedKind, name, file), isec(isec), value(value), overridesWeakDef(false), privateExtern(isPrivateExtern), - linkerInternal(false), weakDef(isWeakDef), external(isExternal) {} + includeInSymtab(true), weakDef(isWeakDef), external(isExternal) {} bool isWeakDef() const override { return weakDef; } bool isExternalWeakDef() const { @@ -124,8 +124,8 @@ public: bool overridesWeakDef : 1; // Whether this symbol should appear in the output binary's export trie. bool privateExtern : 1; - // Whether this symbol should appear in the output binary's symbol table. - bool linkerInternal : 1; + // Whether this symbol should appear in the output symbol table. + bool includeInSymtab : 1; private: const bool weakDef : 1; diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index 1834a48f0e56..cf7d40398e62 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -53,7 +53,12 @@ SyntheticSection::SyntheticSection(const char *segname, const char *name) // dyld3's MachOLoaded::getSlide() assumes that the __TEXT segment starts // from the beginning of the file (i.e. the header). MachHeaderSection::MachHeaderSection() - : SyntheticSection(segment_names::text, section_names::header) {} + : SyntheticSection(segment_names::text, section_names::header) { + // XXX: This is a hack. (See D97007) + // Setting the index to 1 to pretend that this section is the text + // section. 
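+  // (Presumably this matters because the __mh_*_header symbols created in
+  // createSyntheticSymbols() are defined against this section, and their
+  // n_sect entries in the symbol table should name a real section ordinal,
+  // the way ld64 reports __mh_execute_header as being in section 1.)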
+  index = 1;
+}
 
 void MachHeaderSection::addLoadCommand(LoadCommand *lc) {
   loadCommands.push_back(lc);
@@ -754,7 +759,7 @@ void SymtabSection::finalizeContents() {
   for (Symbol *sym : symtab->getSymbols()) {
     if (auto *defined = dyn_cast<Defined>(sym)) {
-      if (defined->linkerInternal)
+      if (!defined->includeInSymtab)
         continue;
       assert(defined->isExternal());
       addSymbol(externalSymbols, defined);
@@ -993,3 +998,56 @@ void CodeSignatureSection::writeTo(uint8_t *buf) const {
   memcpy(id, fileName.begin(), fileName.size());
   memset(id + fileName.size(), 0, fileNamePad);
 }
+
+void macho::createSyntheticSymbols() {
+  auto addHeaderSymbol = [](const char *name) {
+    symtab->addSynthetic(name, in.header->isec, 0,
+                         /*privateExtern=*/true,
+                         /*includeInSymtab*/ false);
+  };
+
+  switch (config->outputType) {
+  // FIXME: Assign the right addresse value for these symbols
+  // (rather than 0). But we need to do that after assignAddresses().
+  case MH_EXECUTE:
+    // If linking PIE, __mh_execute_header is a defined symbol in
+    // (__TEXT, __text).
+    // Otherwise, it's an absolute symbol.
+    if (config->isPic)
+      symtab->addSynthetic("__mh_execute_header", in.header->isec, 0,
+                           /*privateExtern*/ false,
+                           /*includeInSymtab*/ true);
+    else
+      symtab->addSynthetic("__mh_execute_header",
+                           /*isec*/ nullptr, 0,
+                           /*privateExtern*/ false,
+                           /*includeInSymtab*/ true);
+    break;
+
+  // The following symbols are N_SECT symbols, even though the header is not
+  // part of any section, and they are private to the bundle/dylib/object
+  // they are part of.
+  case MH_BUNDLE:
+    addHeaderSymbol("__mh_bundle_header");
+    break;
+  case MH_DYLIB:
+    addHeaderSymbol("__mh_dylib_header");
+    break;
+  case MH_DYLINKER:
+    addHeaderSymbol("__mh_dylinker_header");
+    break;
+  case MH_OBJECT:
+    addHeaderSymbol("__mh_object_header");
+    break;
+  default:
+    llvm_unreachable("unexpected outputType");
+    break;
+  }
+
+  // The Itanium C++ ABI requires dylibs to pass a pointer to __cxa_atexit
+  // which does e.g. cleanup of static global variables. The ABI document
+  // says that the pointer can point to any address in one of the dylib's
+  // segments, but in practice ld64 seems to set it to point to the header,
+  // so that's what's implemented here.
+  addHeaderSymbol("___dso_handle");
+}
diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h
index 74a6d3c9475c..92869476390a 100644
--- a/lld/MachO/SyntheticSections.h
+++ b/lld/MachO/SyntheticSections.h
@@ -500,6 +500,8 @@ struct InStruct {
 extern InStruct in;
 extern std::vector<SyntheticSection *> syntheticSections;
 
+void createSyntheticSymbols();
+
 } // namespace macho
 } // namespace lld
 
diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp
index b2d316355807..4070a2077937 100644
--- a/lld/MachO/Writer.cpp
+++ b/lld/MachO/Writer.cpp
@@ -751,7 +751,6 @@ static void sortSegmentsAndSections() {
     // output section indices.
if (!osec->isHidden()) osec->index = ++sectionIndex; - if (!firstTLVDataSection && isThreadLocalData(osec->flags)) firstTLVDataSection = osec; diff --git a/lld/test/MachO/export-trie.s b/lld/test/MachO/export-trie.s index c479e166561c..cab8312fe1d0 100644 --- a/lld/test/MachO/export-trie.s +++ b/lld/test/MachO/export-trie.s @@ -15,7 +15,9 @@ # EXPORTS-DAG: [[#%x, HELLO_WORLD_ADDR:]] {{.*}} _hello_world # EXPORTS-DAG: [[#%x, HELLO_ITS_ME_ADDR:]] {{.*}} _hello_its_me # EXPORTS-DAG: [[#%x, HELLO_ITS_YOU_ADDR:]] {{.*}} _hello_its_you +# EXPORTS-DAG: {{0+}} g *ABS* __mh_execute_header # EXPORTS-LABEL: Exports trie: +# EXPORTS-DAG: 0x{{0+}} __mh_execute_header [absolute] # EXPORTS-DAG: 0x{{0*}}[[#%X, MAIN_ADDR]] _main # EXPORTS-DAG: 0x{{0*}}[[#%X, HELLO_ADDR]] _hello # EXPORTS-DAG: 0x{{0*}}[[#%X, HELLO_WORLD_ADDR]] _hello_world @@ -27,13 +29,16 @@ # CHECK-LABEL: ExportTrie: # CHECK: Name: '' # CHECK: Name: _ -# CHECK: Name: main -# CHECK: Name: hello +# CHECK-DAG: Name: _mh_execute_header +# CHECK-DAG: Name: main +# CHECK-DAG: Name: hello # CHECK: Name: _ # CHECK: Name: world # CHECK: Name: its_ -# CHECK: Name: you -# CHECK: Name: me +# CHECK-DAG: Name: you +# CHECK-DAG: Name: me + + .section __TEXT,__cstring .globl _hello, _hello_world, _hello_its_me, _hello_its_you, _main diff --git a/lld/test/MachO/map-file.s b/lld/test/MachO/map-file.s index ac5fb93898d3..215d193aa622 100644 --- a/lld/test/MachO/map-file.s +++ b/lld/test/MachO/map-file.s @@ -29,6 +29,7 @@ _main: # CHECK-NEXT: [[#%x,MAIN:]] g F __TEXT,__text _main # CHECK-NEXT: [[#%x,NUMBER:]] g O __DATA,__common _number # CHECK-NEXT: [[#%x,FOO:]] g O __TEXT,obj _foo +# CHECK-NEXT: {{0+}} g *ABS* __mh_execute_header # CHECK-NEXT: # Path: {{.*}}{{/|\\}}map-file.s.tmp/test-map # CHECK-NEXT: # Arch: x86_64 diff --git a/lld/test/MachO/mh-execute-header.s b/lld/test/MachO/mh-execute-header.s new file mode 100644 index 000000000000..4a62b27bb8fa --- /dev/null +++ b/lld/test/MachO/mh-execute-header.s @@ -0,0 +1,16 @@ +# REQUIRES: x86 +# RUN: rm -rf %t; mkdir %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/test.o +# RUN: %lld -o %t/test.pie %t/test.o -pie +# RUN: llvm-objdump --macho --syms %t/test.pie | FileCheck %s --check-prefix=PIE + +# RUN: %lld -o %t/test.no_pie %t/test.o -no_pie +# RUN: llvm-objdump --macho --syms %t/test.no_pie | FileCheck %s --check-prefix=NO-PIE + +# PIE: 0000000100000000 g F __TEXT,__text __mh_execute_header +# NO-PIE: 0000000000000000 g *ABS* __mh_execute_header + +.text +.global _main +_main: + ret diff --git a/lld/test/MachO/mh-header-link.s b/lld/test/MachO/mh-header-link.s new file mode 100644 index 000000000000..0813435397b7 --- /dev/null +++ b/lld/test/MachO/mh-header-link.s @@ -0,0 +1,43 @@ +# REQUIRES: x86 + +## This tests that we can link against these synthetic symbols even +## if they are not in the symbol table. 
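+##
+## (Background, not from the patch itself: __mh_execute_header,
+## __mh_dylib_header and friends are synthesized by the linker; programs
+## reference them to locate their own mach header at run time.)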
+
+# RUN: rm -rf %t; split-file %s %t
+
+## Test that in a dylib, we can link against __mh_dylib_header
+## (but not in other types of files)
+
+# RUN: llvm-mc %t/dylib.s -triple=x86_64-apple-macos11.0 -filetype=obj -o %t/dylib.o
+# RUN: %lld -pie -dylib %t/dylib.o -o %t/dylib.out
+# RUN: llvm-objdump -m --syms %t/dylib.out | FileCheck %s --check-prefix DYLIB
+
+# RUN: not %lld -pie -o /dev/null %t/dylib.o 2>&1 | FileCheck %s --check-prefix ERR-DYLIB
+
+# DYLIB: SYMBOL TABLE:
+# DYLIB-NEXT: {{[0-9a-f]+}} g F __TEXT,__text _main
+# DYLIB-EMPTY:
+# ERR-DYLIB: error: undefined symbol: __mh_dylib_header
+
+## Test that in an executable, we can link against __mh_execute_header
+# RUN: llvm-mc %t/main.s -triple=x86_64-apple-macos11.0 -filetype=obj -o %t/exec.o
+# RUN: %lld -pie %t/exec.o -o %t/exec.out
+
+## But it would be an error trying to reference __mh_execute_header in a dylib
+# RUN: not %lld -pie -o /dev/null -dylib %t/exec.o 2>&1 | FileCheck %s --check-prefix ERR-EXEC
+
+# ERR-EXEC: error: undefined symbol: __mh_execute_header
+
+#--- main.s
+.text
+.globl _main
+_main:
+  mov __mh_execute_header@GOTPCREL(%rip), %rax
+  ret
+
+#--- dylib.s
+.text
+.globl _main
+_main:
+  mov __mh_dylib_header@GOTPCREL(%rip), %rax
+  ret
diff --git a/lld/test/MachO/objc.s b/lld/test/MachO/objc.s
index 53dd12e8f190..06f47d2c3b78 100644
--- a/lld/test/MachO/objc.s
+++ b/lld/test/MachO/objc.s
@@ -32,7 +32,7 @@
 # NO-OBJC-EMPTY:
 # NO-OBJC-NEXT: SYMBOL TABLE:
 # NO-OBJC-NEXT: g F __TEXT,__text _main
-# NO-OBJC-EMPTY:
+# NO-OBJC-NEXT: g *ABS* __mh_execute_header
 #--- has-objc-symbol.s
 .globl _OBJC_CLASS_$_MyObject
diff --git a/lld/test/MachO/stabs.s b/lld/test/MachO/stabs.s
index 09735feb5b26..6a72ff246e1d 100644
--- a/lld/test/MachO/stabs.s
+++ b/lld/test/MachO/stabs.s
@@ -60,6 +60,7 @@
 # CHECK-NEXT: [[#ZERO]] S _zero
 # CHECK-NEXT: [[#FOO]] T _foo
 # CHECK-NEXT: {{[0-9af]+}} T _no_debug
+# CHECK-NEXT: {{0+}} A __mh_execute_header
 # CHECK-EMPTY:
 ## Check that we don't attempt to emit rebase opcodes for the debug sections
diff --git a/lld/test/MachO/symtab.s b/lld/test/MachO/symtab.s
index 44d42068ffb5..35fc961d1b35 100644
--- a/lld/test/MachO/symtab.s
+++ b/lld/test/MachO/symtab.s
@@ -56,6 +56,16 @@
 # CHECK-NEXT: ]
 # CHECK-NEXT: Value: 0x1{{[0-9a-f]*}}
 # CHECK-NEXT: }
+# CHECK-NEXT: Symbol {
+# CHECK-NEXT: Name: __mh_execute_header (81)
+# CHECK-NEXT: Extern
+# CHECK-NEXT: Type: Abs (0x2)
+# CHECK-NEXT: Section: (0x0)
+# CHECK-NEXT: RefType: UndefinedNonLazy (0x0)
+# CHECK-NEXT: Flags [ (0x0)
+# CHECK-NEXT: ]
+# CHECK-NEXT: Value: 0x0
+# CHECK-NEXT: }
 # CHECK-NEXT: Symbol {
 # CHECK-NEXT: Name: dyld_stub_binder
 # CHECK-NEXT: Extern
@@ -82,8 +92,8 @@
 # CHECK-NEXT: ilocalsym: 0
 # CHECK-NEXT: nlocalsym: 2
 # CHECK-NEXT: iextdefsym: 2
-# CHECK-NEXT: nextdefsym: 3
-# CHECK-NEXT: iundefsym: 5
+# CHECK-NEXT: nextdefsym: 4
+# CHECK-NEXT: iundefsym: 6
 # CHECK-NEXT: nundefsym: 2
 ## Verify that the first entry in the StringTable is a space, and that
--
GitLab

From 5698537f81a2ecf1166f41cab264b92af670aaa1 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Fri, 19 Mar 2021 11:15:29 -0700
Subject: [PATCH 0180/1000] Update basic deref API to account for possibility of free [NFC]

This patch is plumbing to support work towards the goal outlined in the
recent llvm-dev post "[llvm-dev] RFC: Decomposing deref(N) into deref(N)
+ nofree". The point of this change is purely to simplify iteration on
other pieces on the way to making the switch.

Rebuilding with a change to Value.h is slow and painful, so I want to
get the API change landed.
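To make the migration concrete, call sites change along these lines (a
sketch distilled from the diffs below, not additional patch content):

  bool CanBeNull, CanBeFreed;
  uint64_t DerefBytes =
      V.getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
  // Conservatively drop the deref fact when the object may be freed
  // between its point of definition and this use.
  if (CanBeFreed)
    DerefBytes = 0;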
Once that's done, I plan to more closely audit each caller, add the
inference rules in their own patch, then post a patch with the langref
changes and test diffs. The value of the command line flag is that we
can exercise the inference logic in standalone patches without needing
the whole switch ready to go just yet.

Differential Revision: https://reviews.llvm.org/D98908
---
 llvm/include/llvm/Analysis/MemoryBuiltins.h      |  4 ++++
 llvm/include/llvm/IR/Value.h                     |  7 ++++++-
 llvm/lib/Analysis/BasicAliasAnalysis.cpp         |  6 ++++--
 llvm/lib/Analysis/CaptureTracking.cpp            |  4 ++--
 llvm/lib/Analysis/Loads.cpp                      |  8 +++++---
 llvm/lib/IR/Value.cpp                            | 11 ++++++++++-
 llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 14 ++++++++------
 .../Transforms/InstCombine/InstCombineCasts.cpp  |  4 ++--
 8 files changed, 41 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/Analysis/MemoryBuiltins.h b/llvm/include/llvm/Analysis/MemoryBuiltins.h
index c5428726995e..39ade20df53f 100644
--- a/llvm/include/llvm/Analysis/MemoryBuiltins.h
+++ b/llvm/include/llvm/Analysis/MemoryBuiltins.h
@@ -212,6 +212,10 @@ struct ObjectSizeOpts {
 /// object size in Size if successful, and false otherwise. In this context, by
 /// object we mean the region of memory starting at Ptr to the end of the
 /// underlying object pointed to by Ptr.
+///
+/// WARNING: The object size returned is the allocation size. This does not
+/// imply dereferenceability at site of use since the object may be freed in
+/// between.
 bool getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL,
                    const TargetLibraryInfo *TLI, ObjectSizeOpts Opts = {});
 
diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h
index 5a7e90aeb0f6..e9a9acfd69ba 100644
--- a/llvm/include/llvm/IR/Value.h
+++ b/llvm/include/llvm/IR/Value.h
@@ -743,8 +743,13 @@ public:
   ///
   /// If CanBeNull is set by this function the pointer can either be null or be
   /// dereferenceable up to the returned number of bytes.
+  ///
+  /// If CanBeFreed is true, the pointer is known to be dereferenceable at
+  /// point of definition only. Caller must prove that allocation is not
+  /// deallocated between point of definition and use.
   uint64_t getPointerDereferenceableBytes(const DataLayout &DL,
-                                          bool &CanBeNull) const;
+                                          bool &CanBeNull,
+                                          bool &CanBeFreed) const;
 
   /// Returns an alignment of the pointer value.
   ///
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index 11fa4d2893e6..a8c5b9ca80e4 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -199,9 +199,11 @@ static uint64_t getMinimalExtentFrom(const Value &V,
   // If we have dereferenceability information we know a lower bound for the
   // extent as accesses for a lower offset would be valid. We need to exclude
   // the "or null" part if null is a valid pointer.
-  bool CanBeNull;
-  uint64_t DerefBytes = V.getPointerDereferenceableBytes(DL, CanBeNull);
+  bool CanBeNull, CanBeFreed;
+  uint64_t DerefBytes =
+      V.getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
   DerefBytes = (CanBeNull && NullIsValidLoc) ? 0 : DerefBytes;
+  DerefBytes = CanBeFreed ? 0 : DerefBytes;
   // If queried with a precise location size, we assume that location size to be
   // accessed, thus valid.
if (LocSize.isPrecise()) diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index b2fc6e603f9e..cf5e53b26b5d 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -68,8 +68,8 @@ bool CaptureTracker::isDereferenceableOrNull(Value *O, const DataLayout &DL) { if (auto *GEP = dyn_cast(O)) if (GEP->isInBounds()) return true; - bool CanBeNull; - return O->getPointerDereferenceableBytes(DL, CanBeNull); + bool CanBeNull, CanBeFreed; + return O->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); } namespace { diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 88e4c723331a..7279ed59c440 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -67,10 +67,12 @@ static bool isDereferenceableAndAlignedPointer( Visited, MaxDepth); } - bool CheckForNonNull = false; + bool CheckForNonNull, CheckForFreed; APInt KnownDerefBytes(Size.getBitWidth(), - V->getPointerDereferenceableBytes(DL, CheckForNonNull)); - if (KnownDerefBytes.getBoolValue() && KnownDerefBytes.uge(Size)) + V->getPointerDereferenceableBytes(DL, CheckForNonNull, + CheckForFreed)); + if (KnownDerefBytes.getBoolValue() && KnownDerefBytes.uge(Size) && + !CheckForFreed) if (!CheckForNonNull || isKnownNonZero(V, DL, 0, nullptr, CtxI, DT)) { // As we recursed through GEPs to get here, we've incrementally checked // that each step advanced by a multiple of the alignment. If our base is diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp index 92ffae18ae6f..cfb91b55f707 100644 --- a/llvm/lib/IR/Value.cpp +++ b/llvm/lib/IR/Value.cpp @@ -38,6 +38,11 @@ using namespace llvm; +static cl::opt UseDerefAtPointSemantics( + "use-dereferenceable-at-point-semantics", cl::Hidden, cl::init(false), + cl::desc("Deref attributes and metadata infer facts at definition only")); + + static cl::opt NonGlobalValueMaxNameSize( "non-global-value-max-name-size", cl::Hidden, cl::init(1024), cl::desc("Maximum size for the name of non-global values.")); @@ -724,11 +729,13 @@ Value::stripInBoundsOffsets(function_ref Func) const { } uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, - bool &CanBeNull) const { + bool &CanBeNull, + bool &CanBeFreed) const { assert(getType()->isPointerTy() && "must be pointer"); uint64_t DerefBytes = 0; CanBeNull = false; + CanBeFreed = UseDerefAtPointSemantics; if (const Argument *A = dyn_cast(this)) { DerefBytes = A->getDereferenceableBytes(); if (DerefBytes == 0) { @@ -783,6 +790,7 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType()).getKnownMinSize(); CanBeNull = false; + CanBeFreed = false; } } else if (auto *GV = dyn_cast(this)) { if (GV->getValueType()->isSized() && !GV->hasExternalWeakLinkage()) { @@ -790,6 +798,7 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL, // CanBeNull flag. 
DerefBytes = DL.getTypeStoreSize(GV->getValueType()).getFixedSize(); CanBeNull = false; + CanBeFreed = false; } } return DerefBytes; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 21fa11aadea8..fa32a22059ac 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -1750,8 +1750,9 @@ struct AANonNullImpl : AANonNull { AANonNull::initialize(A); - bool CanBeNull = true; - if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull)) { + bool CanBeNull, CanBeFreed; + if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull, + CanBeFreed)) { if (!CanBeNull) { indicateOptimisticFixpoint(); return; @@ -3548,10 +3549,10 @@ struct AADereferenceableImpl : AADereferenceable { const IRPosition &IRP = this->getIRPosition(); NonNullAA = &A.getAAFor(*this, IRP, DepClassTy::NONE); - bool CanBeNull; + bool CanBeNull, CanBeFreed; takeKnownDerefBytesMaximum( IRP.getAssociatedValue().getPointerDereferenceableBytes( - A.getDataLayout(), CanBeNull)); + A.getDataLayout(), CanBeNull, CanBeFreed)); bool IsFnInterface = IRP.isFnInterfaceKind(); Function *FnScope = IRP.getAnchorScope(); @@ -3661,8 +3662,9 @@ struct AADereferenceableFloating : AADereferenceableImpl { if (!Stripped && this == &AA) { // Use IR information if we did not strip anything. // TODO: track globally. - bool CanBeNull; - DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull); + bool CanBeNull, CanBeFreed; + DerefBytes = + Base->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); T.GlobalState.indicatePessimisticFixpoint(); } else { const DerefState &DS = AA.getState(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index e6e90b915bb8..ae72123e3f00 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -2593,8 +2593,8 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { // If the source pointer is dereferenceable, then assume it points to an // allocated object and apply "inbounds" to the GEP. - bool CanBeNull; - if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) { + bool CanBeNull, CanBeFreed; + if (Src->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed)) { // In a non-default address space (not 0), a null pointer can not be // assumed inbounds, so ignore that case (dereferenceable_or_null). // The reason is that 'null' is not treated differently in these address -- GitLab From 6c1ae8f2dc374a532f6345d776082eb5b9aaa247 Mon Sep 17 00:00:00 2001 From: Vy Nguyen Date: Fri, 19 Mar 2021 14:17:51 -0400 Subject: [PATCH 0181/1000] [lld-macho][nfc] Fixed typo in comment Missed this one from https://reviews.llvm.org/D97007?id=331759#inline-930034 Differential Revision: https://reviews.llvm.org/D98973 --- lld/MachO/SyntheticSections.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index cf7d40398e62..623307971ef7 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1007,7 +1007,7 @@ void macho::createSyntheticSymbols() { }; switch (config->outputType) { - // FIXME: Assign the right addresse value for these symbols + // FIXME: Assign the right address value for these symbols // (rather than 0). But we need to do that after assignAddresses(). 
  case MH_EXECUTE:
    // If linking PIE, __mh_execute_header is a defined symbol in
-- 
GitLab


From d4cba4a188f419d1c2fc4b827c4a6a0310b0568e Mon Sep 17 00:00:00 2001
From: Stella Laurenzo
Date: Tue, 16 Mar 2021 18:05:19 -0700
Subject: [PATCH 0182/1000] [mlir][linalg] Add structured op builders from python opdsl.

* Makes the wrapped functions of the `@linalg_structured_op` decorator callable such that they emit IR imperatively when invoked.
* There are numerous TODOs that I will keep working through to achieve generality.
* Will true up exception handling tests as the feature progresses (for things that are actually errors once everything is implemented).
* Includes the addition of an `isinstance` method on concrete types in the Python API.

Differential Revision: https://reviews.llvm.org/D98754
---
 mlir/lib/Bindings/Python/IRModules.cpp        |   3 +
 .../mlir/dialects/linalg/opdsl/lang/config.py |  17 +-
 .../mlir/dialects/linalg/opdsl/lang/dsl.py    |  33 ++-
 .../dialects/linalg/opdsl/lang/emitter.py     | 252 ++++++++++++++++++
 .../linalg/opdsl/emit_structured_generic.py   | 194 ++++++++++++++
 mlir/test/Bindings/Python/ir_types.py         |  15 ++
 6 files changed, 503 insertions(+), 11 deletions(-)
 create mode 100644 mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/emitter.py
 create mode 100644 mlir/test/Bindings/Python/dialects/linalg/opdsl/emit_structured_generic.py

diff --git a/mlir/lib/Bindings/Python/IRModules.cpp b/mlir/lib/Bindings/Python/IRModules.cpp
index a544e52c2613..6b4e5434d1d7 100644
--- a/mlir/lib/Bindings/Python/IRModules.cpp
+++ b/mlir/lib/Bindings/Python/IRModules.cpp
@@ -2477,6 +2477,9 @@ public:
   static void bind(py::module &m) {
     auto cls = ClassTy(m, DerivedTy::pyClassName);
     cls.def(py::init<PyType &>(), py::keep_alive<0, 1>());
+    cls.def_static("isinstance", [](PyType &otherType) -> bool {
+      return DerivedTy::isaFunction(otherType);
+    });
     DerivedTy::bindDerived(cls);
   }

diff --git a/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/config.py b/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/config.py
index 115ea40619b8..fdc6cfd9bab0 100644
--- a/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/config.py
+++ b/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/config.py
@@ -21,6 +21,7 @@ from .yaml_helper import *
 __all__ = [
     "LinalgStructuredOpConfig",
     "LinalgOpConfig",
+    "TensorDefConfig",
 ]


@@ -51,17 +52,17 @@ class TensorDefConfig(YAMLObject):
     self.shape_map = shape_map
     self.indexing_map = None  # type: Optional[_ir.AffineMap]

-  def to_yaml_custom_dict(self):
-
-    def get_usage():
-      if self.tensor_def.output:
-        return "output"
-      else:
-        return "input"
+  @property
+  def usage(self) -> str:
+    if self.tensor_def.output:
+      return "output"
+    else:
+      return "input"

+  def to_yaml_custom_dict(self):
     return dict(
         name=self.tensor_def.tensor_name,
-        usage=get_usage(),
+        usage=self.usage,
         shape=_serialize_affine_map(self.shape_map),
         element_type_var=self.tensor_def.type_var.name,
     )
diff --git a/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/dsl.py b/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/dsl.py
index d367c5bdde07..cbff41db2d88 100644
--- a/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/dsl.py
+++ b/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/dsl.py
@@ -11,6 +11,8 @@ import threading
 from mlir import ir

 from .comprehension import *
+from .config import *
+from .emitter import *

 _CONTEXT = threading.local()

@@ -42,9 +44,34 @@ class DefinedOpCallable:
     self.op_name = op_name
     self.model = model

-  def __call__(self, *args, **kwargs):
-    # 
TODO: Upstream the emitter and invoke here - raise NotImplementedError("Linalg generic emission not yet implemented") + def __call__(self, *args, emit_generic: bool = True, **kwargs): + """Emits the corresponding op definition as IR. + + Most arguments are passed through to the underlying emitter. The following + are interpreted here: + emit_generic: Emits a generic form as appropriate (default True). If + False, a named form is emitted (which must have been built in to the + compiler). + """ + op_configs = LinalgOpConfig.from_linalg_op_def(self.model, + context=ir.Context.current) + + if len(op_configs) != 1: + # TODO: Support composite ops. + raise NotImplementedError( + f"Emission of composite linalg ops not supported: {op_configs}") + + op_config = op_configs[0] + if op_config.structured_op: + if emit_generic: + return emit_generic_structured_op(op_config.structured_op, *args, + **kwargs) + else: + return emit_named_structured_op(op_config.structured_op, *args, + **kwargs) + + raise NotImplementedError( + f"Emission of linalg op type not supported: {op_config}") def linalg_structured_op(dsl_func=None, diff --git a/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/emitter.py b/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/emitter.py new file mode 100644 index 000000000000..9a18993e9f62 --- /dev/null +++ b/mlir/lib/Bindings/Python/mlir/dialects/linalg/opdsl/lang/emitter.py @@ -0,0 +1,252 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from typing import Dict, Sequence + +from mlir.ir import * +from mlir.dialects import linalg +from mlir.dialects import std + +from .scalar_expr import * +from .config import * + +__all__ = [ + "emit_generic_structured_op", + "emit_named_structured_op", +] + + +def emit_generic_structured_op(op_config: LinalgStructuredOpConfig, + *ins: Value, + outs: Value = ()): + all_arg_defs = op_config.ordered_tensor_args + in_arg_defs = [arg for arg in all_arg_defs if arg.usage == "input"] + out_arg_defs = [arg for arg in all_arg_defs if arg.usage == "output"] + + # Arity validation. + if len(ins) != len(in_arg_defs): + raise ValueError(f"Expected {len(in_arg_defs)} inputs but got " + f"{len(ins)} for {op_config}") + if outs and len(outs) != len(out_arg_defs): + raise ValueError(f"Expected {len(out_arg_defs)} outputs but got " + f"{len(outs)} for {op_config}") + + outs, out_types = _infer_structured_outs(op_config, in_arg_defs, ins, + out_arg_defs, outs) + + # Extract type vars for input/output based types. + type_mapping = dict() # type: Dict[str, Type] + for arg_def, arg_element_type in zip( + in_arg_defs + out_arg_defs, + _get_shaped_element_types_from_values(*ins, *outs)): + tv_name = arg_def.tensor_def.type_var.name + type_mapping[tv_name] = arg_element_type + + # Emit the generic op. + # TODO: Support emission of pure memref form. + indexing_maps_attr = ArrayAttr.get( + [AffineMapAttr.get(am) for am in op_config.indexing_maps]) + iterator_types_attr = ArrayAttr.get( + [StringAttr.get(s) for s in op_config.iterator_types]) + generic_op = linalg.GenericOp( + result_tensors=out_types, + inputs=ins, + outputs=outs, + indexing_maps=indexing_maps_attr, + iterator_types=iterator_types_attr, + doc=None, # TODO: Make optional. + library_call=None, # TODO: Make optional. + sparse=BoolAttr.get(False)) # TODO: Make optional. + + # Construct the body. 
+  block_arg_names = _get_tensor_def_names(*in_arg_defs, *out_arg_defs)
+  block_arg_types = _get_shaped_element_types_from_values(*ins, *outs)
+  block = generic_op.regions[0].blocks.append(*block_arg_types)
+  block_arg_mapping = dict(zip(block_arg_names, block.arguments))
+  with InsertionPoint(block):
+    body_builder = _BodyBuilder(type_mapping, block_arg_mapping)
+    for assignment in op_config.assignments:
+      body_builder.assign(assignment)
+    body_builder.yield_outputs(*_get_tensor_def_names(*out_arg_defs))
+
+  if len(out_arg_defs) == 1:
+    return generic_op.result
+  else:
+    return generic_op.results
+
+
+def emit_named_structured_op(op_config: LinalgStructuredOpConfig,
+                             *ins: Value,
+                             outs: Value = ()):
+  raise NotImplementedError(
+      f"Emission of named structured ops is not supported: {op_config}")
+
+
+class _BodyBuilder:
+  """Constructs a structured op body by evaluating assignments."""
+
+  def __init__(self, type_mapping: Dict[str, Type],
+               block_arg_mapping: Dict[str, Value]):
+    self.type_mapping = type_mapping
+    self.block_arg_mapping = block_arg_mapping
+    self.yield_mapping = dict()  # type: Dict[str, Value]
+
+  def assign(self, assignment: ScalarAssign):
+    if assignment.arg in self.yield_mapping:
+      raise ValueError(
+          f"Multiple assignments to the same argument are forbidden: "
+          f"{assignment}")
+    self.yield_mapping[assignment.arg] = self.expression(assignment.value)
+
+  def expression(self, expr: ScalarExpression) -> Value:
+    if expr.scalar_arg:
+      try:
+        return self.block_arg_mapping[expr.scalar_arg.arg]
+      except KeyError:
+        raise ValueError(f"Argument {expr.scalar_arg.arg} is not bound for "
+                         f"this structured op.")
+    elif expr.scalar_apply:
+      try:
+        fn = getattr(self, f"_eval_{expr.scalar_apply.fn_name}")
+      except AttributeError:
+        raise ValueError(
+            f"Function '{expr.scalar_apply.fn_name}' is not a known "
+            "scalar body function")
+      operand_values = [
+          self.expression(operand) for operand in expr.scalar_apply.operands
+      ]
+      return fn(*operand_values)
+    elif expr.symbolic_cast:
+      operand_value = self.expression(expr.symbolic_cast.operand)
+      return self.cast(expr.symbolic_cast.to_type.name, operand_value)
+    raise NotImplementedError(f"Unimplemented scalar body expression: {expr}")
+
+  def cast(self, type_var_name: str, operand: Value) -> Value:
+    try:
+      to_type = self.type_mapping[type_var_name]
+    except KeyError:
+      raise ValueError(f"Unbound type variable '{type_var_name}' ("
+                       f"expected one of {self.type_mapping.keys()})")
+    if operand.type == to_type:
+      return operand
+    if _is_integer_type(to_type):
+      return self._cast_to_integer(to_type, operand)
+    elif _is_floating_point_type(to_type):
+      return self._cast_to_floating_point(to_type, operand)
+
+    raise ValueError(f"Unable to cast body expression from {operand.type} to "
+                     f"{to_type}")
+
+  def _cast_to_integer(self, to_type: Type, operand: Value) -> Value:
+    to_width = IntegerType(to_type).width
+    operand_type = operand.type
+    if _is_floating_point_type(operand_type):
+      return std.FPToSIOp(to_type, operand).result
+    # Assume integer. 
+    from_width = IntegerType(operand_type).width
+    if to_width > from_width:
+      return std.SignExtendIOp(to_type, operand).result
+    elif to_width < from_width:
+      return std.TruncateIOp(to_type, operand).result
+    raise ValueError(f"Unable to cast body expression from {operand_type} to "
+                     f"{to_type}")
+
+  def _cast_to_floating_point(self, to_type: Type, operand: Value) -> Value:
+    operand_type = operand.type
+    if _is_integer_type(operand_type):
+      return std.SIToFPOp(to_type, operand).result
+    # Assume FloatType.
+    to_width = _get_floating_point_width(to_type)
+    from_width = _get_floating_point_width(operand_type)
+    if to_width > from_width:
+      return std.FPExtOp(to_type, operand).result
+    elif to_width < from_width:
+      return std.FPTruncOp(to_type, operand).result
+    raise ValueError(f"Unable to cast body expression from {operand_type} to "
+                     f"{to_type}")
+
+  def yield_outputs(self, *output_names: str):
+    output_values = []
+    for n in output_names:
+      try:
+        output_values.append(self.yield_mapping[n])
+      except KeyError:
+        raise ValueError(f"Body assignments do not assign all outputs: "
+                         f"missing '{n}'")
+    linalg.YieldOp(output_values)
+
+  def _eval_add(self, lhs: Value, rhs: Value) -> Value:
+    if _is_floating_point_type(lhs.type):
+      return std.AddFOp(lhs.type, lhs, rhs).result
+    if _is_integer_type(lhs.type):
+      return std.AddIOp(lhs.type, lhs, rhs).result
+    raise NotImplementedError(f"Unsupported 'add' operand: {lhs}")
+
+  def _eval_mul(self, lhs: Value, rhs: Value) -> Value:
+    if _is_floating_point_type(lhs.type):
+      return std.MulFOp(lhs.type, lhs, rhs).result
+    if _is_integer_type(lhs.type):
+      return std.MulIOp(lhs.type, lhs, rhs).result
+    raise NotImplementedError(f"Unsupported 'mul' operand: {lhs}")
+
+
+def _infer_structured_outs(op_config: LinalgStructuredOpConfig,
+                           in_arg_defs: Sequence[TensorDefConfig],
+                           ins: Sequence[Value],
+                           out_arg_defs: Sequence[TensorDefConfig],
+                           outs: Sequence[Value]):
+  """Infers implicit outs and output types.
+
+  Respects existing contents of outs if not empty.
+
+  Returns:
+    normalized outs, output types
+  """
+  # If outs were explicitly provided, we accept them verbatim.
+  if outs:
+    return outs, [out.type for out in outs]
+
+  raise NotImplementedError(f"Output tensor inference not yet supported for "
+                            "structured ops")
+
+
+def _get_shaped_element_types_from_values(*values: Value) -> Sequence[Type]:
+  types = []
+  for v in values:
+    try:
+      t = ShapedType(v.type)
+    except Exception as e:
+      raise ValueError(f"Expected ShapedType but got {v}") from e
+    types.append(t.element_type)
+  return types
+
+
+def _get_tensor_def_names(
+    *tensor_def_configs: TensorDefConfig) -> Sequence[str]:
+  return [tdc.tensor_def.tensor_name for tdc in tensor_def_configs]
+
+
+def _is_floating_point_type(t: Type) -> bool:
+  # TODO: Create a FloatType in the Python API and implement the switch
+  # there.
+  return (F64Type.isinstance(t) or F32Type.isinstance(t) or
+          F16Type.isinstance(t) or BF16Type.isinstance(t))
+
+
+def _is_integer_type(t: Type) -> bool:
+  return IntegerType.isinstance(t)
+
+
+def _get_floating_point_width(t: Type) -> int:
+  # TODO: Create a FloatType in the Python API and implement the switch
+  # there. 
+ if F64Type.isinstance(t): + return 64 + if F32Type.isinstance(t): + return 32 + if F16Type.isinstance(t): + return 16 + if BF16Type.isinstance(t): + return 16 + raise NotImplementedError(f"Unhandled floating point type switch {t}") diff --git a/mlir/test/Bindings/Python/dialects/linalg/opdsl/emit_structured_generic.py b/mlir/test/Bindings/Python/dialects/linalg/opdsl/emit_structured_generic.py new file mode 100644 index 000000000000..7f8c11679457 --- /dev/null +++ b/mlir/test/Bindings/Python/dialects/linalg/opdsl/emit_structured_generic.py @@ -0,0 +1,194 @@ +# RUN: %PYTHON %s | FileCheck %s + +from typing import Optional, Sequence + +from mlir.ir import * +from mlir.dialects import builtin +from mlir.dialects import linalg +from mlir.dialects import std + +from mlir.dialects.linalg.opdsl.lang import * + + +# TODO: Find a home for this quality of life helper. +def build_function(*inputs: Type, results: Optional[Sequence[Type]] = None): + """Decorator that emits a function in a more pythonic way. + + If result types are not specified, they are inferred from the function + returns. The `ReturnOp` is implicitly added upon the wrapped function return. + """ + + def decorator(f): + return_types = results + symbol_name = f.__name__ + function_type = FunctionType.get(inputs=inputs, results=results or []) + func_op = builtin.FuncOp(name=symbol_name, type=function_type) + with InsertionPoint(func_op.add_entry_block()): + func_args = func_op.entry_block.arguments + return_values = f(*func_args) + if return_values is None: + return_values = [] + elif isinstance(return_values, Value): + return_values = [return_values] + else: + return_values = list(return_values) + std.ReturnOp(return_values) + if return_types is None: + # Recompute the function type. + return_types = [v.type for v in return_values] + function_type = FunctionType.get(inputs=inputs, results=return_types) + # TODO: Have an API or a setter for this. + func_op.attributes["type"] = TypeAttr.get(function_type) + + # TODO: When turning this into a real facility, return a function that emits + # a `call` to the function instead of doing nothing. + wrapped = lambda: None + wrapped.__name__ = symbol_name + wrapped.func_op = func_op + return wrapped + + return decorator + + +@linalg_structured_op +def matmul_mono(A=TensorDef(T, S.M, S.K), + B=TensorDef(T, S.K, S.N), + C=TensorDef(T, S.M, S.N, output=True)): + C[D.m, D.n] += A[D.m, D.k] * B[D.k, D.n] + + +@linalg_structured_op +def matmul_poly(A=TensorDef(TV.T1, S.M, S.K), + B=TensorDef(TV.T2, S.K, S.N), + C=TensorDef(U, S.M, S.N, output=True)): + C[D.m, D.n] += cast(U, A[D.m, D.k]) * cast(U, B[D.k, D.n]) + + +with Context() as ctx, Location.unknown(): + module = Module.create() + f16 = F16Type.get() + f32 = F32Type.get() + f64 = F64Type.get() + i8 = IntegerType.get_signless(8) + i16 = IntegerType.get_signless(16) + i32 = IntegerType.get_signless(32) + with InsertionPoint.at_block_terminator(module.body): + + # Note that these all have the same indexing maps. We verify the first and + # then do more permutation tests on casting and body generation + # behavior. 
+ # CHECK: #[[$MAPA:.+]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)> + # CHECK: #[[$MAPB:.+]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)> + # CHECK: #[[$MAPC:.+]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)> + + # CHECK-LABEL: func @test_matmul_mono + # CHECK-SAME: %[[A:.+]]: tensor<4x16xf32> + # CHECK-SAME: %[[B:.+]]: tensor<16x8xf32> + + # CHECK: %[[INITC:.+]] = linalg.init_tensor [4, 8] : tensor<4x8xf32> + # CHECK: linalg.generic + # CHECK-SAME: indexing_maps = [#[[$MAPA]], #[[$MAPB]], #[[$MAPC]]] + # CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"] + # CHECK-SAME: ins(%[[A]], %[[B]] + # CHECK-SAME: outs(%[[INITC]] + + @build_function(RankedTensorType.get((4, 16), f32), + RankedTensorType.get((16, 8), f32)) + def test_matmul_mono(lhs, rhs): + # TODO: Enable outs inference and add sugar for InitTensorOp + # construction. + init_result = linalg.InitTensorOp(result=RankedTensorType.get((4, 8), + f32), + static_sizes=ArrayAttr.get([ + IntegerAttr.get(IndexType.get(), 4), + IntegerAttr.get(IndexType.get(), 8) + ]), + sizes=[]) + return matmul_mono(lhs, rhs, outs=[init_result.result]) + + # CHECK-LABEL: @test_i8i8i32_matmul + # CHECK: ^{{.*}}(%[[A_ARG:.+]]: i8, %[[B_ARG:.+]]: i8, %[[C_ARG:.+]]: i32) + # CHECK-NEXT: %[[A_CAST:.+]] = sexti %[[A_ARG]] : i8 to i32 + # CHECK-NEXT: %[[B_CAST:.+]] = sexti %[[B_ARG]] : i8 to i32 + # CHECK-NEXT: %[[MUL:.+]] = muli %[[A_CAST]], %[[B_CAST]] : i32 + # CHECK-NEXT: %[[ADD:.+]] = addi %[[C_ARG]], %[[MUL]] : i32 + # CHECK-NEXT: linalg.yield %[[ADD]] : i32 + # CHECK-NEXT: -> tensor<4x8xi32> + @build_function(RankedTensorType.get((4, 16), i8), + RankedTensorType.get((16, 8), i8), + RankedTensorType.get((4, 8), i32)) + def test_i8i8i32_matmul(lhs, rhs, init_result): + return matmul_poly(lhs, rhs, outs=[init_result]) + + # CHECK-LABEL: @test_i8i16i32_matmul + # CHECK: ^{{.*}}(%[[A_ARG:.+]]: i8, %[[B_ARG:.+]]: i16, %[[C_ARG:.+]]: i32) + # CHECK-NEXT: %[[A_CAST:.+]] = sexti %[[A_ARG]] : i8 to i32 + # CHECK-NEXT: %[[B_CAST:.+]] = sexti %[[B_ARG]] : i16 to i32 + # CHECK-NEXT: %[[MUL:.+]] = muli %[[A_CAST]], %[[B_CAST]] : i32 + # CHECK-NEXT: %[[ADD:.+]] = addi %[[C_ARG]], %[[MUL]] : i32 + # CHECK-NEXT: linalg.yield %[[ADD]] : i32 + # CHECK-NEXT: -> tensor<4x8xi32> + @build_function(RankedTensorType.get((4, 16), i8), + RankedTensorType.get((16, 8), i16), + RankedTensorType.get((4, 8), i32)) + def test_i8i16i32_matmul(lhs, rhs, init_result): + return matmul_poly(lhs, rhs, outs=[init_result]) + + # CHECK-LABEL: @test_i32i32i16_matmul + # CHECK: ^{{.*}}(%[[A_ARG:.+]]: i32, %[[B_ARG:.+]]: i32, %[[C_ARG:.+]]: i16) + # CHECK-NEXT: %[[A_CAST:.+]] = trunci %[[A_ARG]] : i32 to i16 + # CHECK-NEXT: %[[B_CAST:.+]] = trunci %[[B_ARG]] : i32 to i16 + # CHECK-NEXT: %[[MUL:.+]] = muli %[[A_CAST]], %[[B_CAST]] : i16 + # CHECK-NEXT: %[[ADD:.+]] = addi %[[C_ARG]], %[[MUL]] : i16 + # CHECK-NEXT: linalg.yield %[[ADD]] : i16 + # CHECK-NEXT: -> tensor<4x8xi16> + @build_function(RankedTensorType.get((4, 16), i32), + RankedTensorType.get((16, 8), i32), + RankedTensorType.get((4, 8), i16)) + def test_i32i32i16_matmul(lhs, rhs, init_result): + return matmul_poly(lhs, rhs, outs=[init_result]) + + # CHECK-LABEL: @test_i8i8f32_matmul + # CHECK: ^{{.*}}(%[[A_ARG:.+]]: i8, %[[B_ARG:.+]]: i8, %[[C_ARG:.+]]: f32) + # CHECK-NEXT: %[[A_CAST:.+]] = sitofp %[[A_ARG]] : i8 to f32 + # CHECK-NEXT: %[[B_CAST:.+]] = sitofp %[[B_ARG]] : i8 to f32 + # CHECK-NEXT: %[[MUL:.+]] = mulf %[[A_CAST]], %[[B_CAST]] : f32 + # CHECK-NEXT: %[[ADD:.+]] = addf %[[C_ARG]], %[[MUL]] : f32 + 
# CHECK-NEXT: linalg.yield %[[ADD]] : f32
+    # CHECK-NEXT: -> tensor<4x8xf32>
+    @build_function(RankedTensorType.get((4, 16), i8),
+                    RankedTensorType.get((16, 8), i8),
+                    RankedTensorType.get((4, 8), f32))
+    def test_i8i8f32_matmul(lhs, rhs, init_result):
+      return matmul_poly(lhs, rhs, outs=[init_result])
+
+    # CHECK-LABEL: @test_f16f16f32_matmul
+    # CHECK: ^{{.*}}(%[[A_ARG:.+]]: f16, %[[B_ARG:.+]]: f16, %[[C_ARG:.+]]: f32)
+    # CHECK-NEXT: %[[A_CAST:.+]] = fpext %[[A_ARG]] : f16 to f32
+    # CHECK-NEXT: %[[B_CAST:.+]] = fpext %[[B_ARG]] : f16 to f32
+    # CHECK-NEXT: %[[MUL:.+]] = mulf %[[A_CAST]], %[[B_CAST]] : f32
+    # CHECK-NEXT: %[[ADD:.+]] = addf %[[C_ARG]], %[[MUL]] : f32
+    # CHECK-NEXT: linalg.yield %[[ADD]] : f32
+    # CHECK-NEXT: -> tensor<4x8xf32>
+    @build_function(RankedTensorType.get((4, 16), f16),
+                    RankedTensorType.get((16, 8), f16),
+                    RankedTensorType.get((4, 8), f32))
+    def test_f16f16f32_matmul(lhs, rhs, init_result):
+      return matmul_poly(lhs, rhs, outs=[init_result])
+
+    # CHECK-LABEL: @test_f64f64f32_matmul
+    # CHECK: ^{{.*}}(%[[A_ARG:.+]]: f64, %[[B_ARG:.+]]: f64, %[[C_ARG:.+]]: f32)
+    # CHECK-NEXT: %[[A_CAST:.+]] = fptrunc %[[A_ARG]] : f64 to f32
+    # CHECK-NEXT: %[[B_CAST:.+]] = fptrunc %[[B_ARG]] : f64 to f32
+    # CHECK-NEXT: %[[MUL:.+]] = mulf %[[A_CAST]], %[[B_CAST]] : f32
+    # CHECK-NEXT: %[[ADD:.+]] = addf %[[C_ARG]], %[[MUL]] : f32
+    # CHECK-NEXT: linalg.yield %[[ADD]] : f32
+    # CHECK-NEXT: -> tensor<4x8xf32>
+    @build_function(RankedTensorType.get((4, 16), f64),
+                    RankedTensorType.get((16, 8), f64),
+                    RankedTensorType.get((4, 8), f32))
+    def test_f64f64f32_matmul(lhs, rhs, init_result):
+      return matmul_poly(lhs, rhs, outs=[init_result])
+
+
+print(module)
diff --git a/mlir/test/Bindings/Python/ir_types.py b/mlir/test/Bindings/Python/ir_types.py
index 59b4b50b533d..ea05c1561f74 100644
--- a/mlir/test/Bindings/Python/ir_types.py
+++ b/mlir/test/Bindings/Python/ir_types.py
@@ -59,6 +59,21 @@ def testTypeEq():
 run(testTypeEq)

+# CHECK-LABEL: TEST: testTypeIsInstance
+def testTypeIsInstance():
+  ctx = Context()
+  t1 = Type.parse("i32", ctx)
+  t2 = Type.parse("f32", ctx)
+  # CHECK: True
+  print(IntegerType.isinstance(t1))
+  # CHECK: False
+  print(F32Type.isinstance(t1))
+  # CHECK: True
+  print(F32Type.isinstance(t2))
+
+run(testTypeIsInstance)
+
+
 # CHECK-LABEL: TEST: testTypeEqDoesNotRaise
 def testTypeEqDoesNotRaise():
   ctx = Context()
-- 
GitLab


From a2e0312cda40efc6cbfd7533e162120ce9cd68b8 Mon Sep 17 00:00:00 2001
From: David Green
Date: Fri, 19 Mar 2021 18:30:11 +0000
Subject: [PATCH 0183/1000] [ARM] Tone down the MVE scalarization overhead

The scalarization overhead was set deliberately high for MVE, whilst the
codegen was new. It helps protect us against the negative ramifications of
mixing scalar and vector instructions. This decreases that, especially for
floating point where the cost of extracting/inserting lane elements can be
low. For integer the cost is still fairly high due to the
cross-register-bank copy, but is no longer n^2 in the length of the vector.

In general, this will decrease the cost of scalarizing floats and long
integer vectors. i64s increase in cost, having a high cost both before and
after this patch. For floats this allows us to start doing things like
vectorizing fdiv instructions, even if they are scalarized.
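As a rough standalone sketch of the new per-lane cost rule (an illustration
only, not LLVM's actual API: `mveLaneMoveCost` is a hypothetical helper, and
`LegalizedCost` stands in for the `LT.first` value that
getTypeLegalizationCost produces in the diff below; the factor of 4 for
integer lanes is taken directly from the patch):

    #include <cstdio>

    // Minimal model of the new MVE insert/extract lane cost: integer lanes
    // pay a factor of 4 for the vector-to-GPR cross-register-bank copy,
    // while floating-point lanes pay 1, since they can often be simple vmovs.
    static unsigned mveLaneMoveCost(bool IsIntegerLane, unsigned LegalizedCost) {
      return LegalizedCost * (IsIntegerLane ? 4u : 1u);
    }

    int main() {
      // Assuming a legalization cost of 1 for a legal scalar type:
      std::printf("f32 lane: %u\n", mveLaneMoveCost(false, 1)); // prints 1
      std::printf("i32 lane: %u\n", mveLaneMoveCost(true, 1));  // prints 4
      return 0;
    }

Unlike the old formula, the per-lane cost no longer scales with the number of
vector elements, which is what removes the n^2 behaviour described above.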
Differential Revision: https://reviews.llvm.org/D98245
---
 .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 13 +-
 .../Analysis/CostModel/ARM/arith-overflow.ll | 216 ++++++------
 .../test/Analysis/CostModel/ARM/arith-ssat.ll | 44 +--
 .../test/Analysis/CostModel/ARM/arith-usat.ll | 40 +--
 llvm/test/Analysis/CostModel/ARM/arith.ll | 252 +++++++-------
 llvm/test/Analysis/CostModel/ARM/cast.ll | 224 ++++++-------
 llvm/test/Analysis/CostModel/ARM/cast_ldst.ll | 32 +-
 llvm/test/Analysis/CostModel/ARM/cmps.ll | 8 +-
 llvm/test/Analysis/CostModel/ARM/divrem.ll | 264 +++++++--------
 llvm/test/Analysis/CostModel/ARM/fparith.ll | 72 ++--
 .../CostModel/ARM/intrinsic-cost-kinds.ll | 42 +--
 .../test/Analysis/CostModel/ARM/load_store.ll | 16 +-
 llvm/test/Analysis/CostModel/ARM/mve-abs.ll | 20 +-
 llvm/test/Analysis/CostModel/ARM/mve-cmp.ll | 52 +--
 .../CostModel/ARM/mve-gather-scatter-cost.ll | 138 ++++----
 .../test/Analysis/CostModel/ARM/mve-minmax.ll | 160 ++++-----
 .../CostModel/ARM/mve-vecreduce-add.ll | 308 +++++++++---------
 .../Analysis/CostModel/ARM/reduce-smax.ll | 40 +--
 .../Analysis/CostModel/ARM/reduce-smin.ll | 40 +--
 .../Analysis/CostModel/ARM/reduce-umax.ll | 40 +--
 .../Analysis/CostModel/ARM/reduce-umin.ll | 40 +--
 llvm/test/Analysis/CostModel/ARM/select.ll | 18 +-
 llvm/test/Analysis/CostModel/ARM/shuffle.ll | 100 +++---
 .../LoopVectorize/ARM/mve-icmpcost.ll | 30 +-
 .../LoopVectorize/ARM/mve-interleaved-cost.ll | 264 +++++++-------
 .../LoopVectorize/ARM/mve-saddsatcost.ll | 2 +-
 .../LoopVectorize/ARM/pointer_iv.ll | 56 ++--
 .../ARM/tail-folding-not-allowed.ll | 67 +++-
 28 files changed, 1324 insertions(+), 1274 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 4761a502026c..c27a0e5c2285 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -817,13 +817,12 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
   if (ST->hasMVEIntegerOps() &&
       (Opcode == Instruction::InsertElement ||
        Opcode == Instruction::ExtractElement)) {
-    // We say MVE moves costs at least the MVEVectorCostFactor, even though
-    // they are scalar instructions. This helps prevent mixing scalar and
-    // vector, to prevent vectorising where we end up just scalarising the
-    // result anyway.
-    return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
-                    ST->getMVEVectorCostFactor(TTI::TCK_RecipThroughput)) *
-           cast<FixedVectorType>(ValTy)->getNumElements() / 2;
+    // Integer cross-lane moves are more expensive than float, which can
+    // sometimes just be vmovs. Integers involve being passed to GPR
+    // registers, causing more of a delay.
+    std::pair<int, MVT> LT =
+        getTLI()->getTypeLegalizationCost(DL, ValTy->getScalarType());
+    return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 
4 : 1); } return BaseT::getVectorInstrCost(Opcode, ValTy, Index); diff --git a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll index ce052b3398e5..5a74d68f8a73 100644 --- a/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll +++ b/llvm/test/Analysis/CostModel/ARM/arith-overflow.ll @@ -67,21 +67,21 @@ define i32 @sadd(i32 %arg) { ; ; MVE-RECIP-LABEL: 'sadd' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 242 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 866 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 370 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 738 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1890 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 274 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 546 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1874 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7332 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> 
} @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 530 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1060 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1098 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7316 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28968 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2088 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'sadd' @@ -124,21 +124,21 @@ define i32 @sadd(i32 %arg) { ; ; MVE-SIZE-LABEL: 'sadd' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 150 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 787 for instruction: %V16I32 = call { <16 x 
i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 211 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.sadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I16 = call { i16, i1 } @llvm.sadd.with.overflow.i16(i16 undef, i16 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.sadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 779 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3092 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.sadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 404 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.sadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I8 = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 undef, i8 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.sadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3084 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12310 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.sadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 790 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.sadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 undef, i64 undef) @@ -225,21 +225,21 @@ define i32 @uadd(i32 %arg) { ; ; MVE-RECIP-LABEL: 'uadd' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost 
Model: Found an estimated cost of 144 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1040 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1032 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4112 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; 
MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'uadd' @@ -282,21 +282,21 @@ define i32 @uadd(i32 %arg) { ; ; MVE-SIZE-LABEL: 'uadd' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.uadd.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.uadd.with.overflow.i16(i16 undef, i16 undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.uadd.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1032 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.uadd.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.uadd.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.uadd.with.overflow.i8(i8 undef, i8 undef) ; MVE-SIZE-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.uadd.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1028 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4104 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.uadd.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.uadd.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef ; %I64 = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 undef, i64 undef) @@ -383,21 +383,21 @@ define i32 @ssub(i32 %arg) { ; ; MVE-RECIP-LABEL: 'ssub' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 242 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 866 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 370 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 738 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 90 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 498 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1890 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 274 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 546 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I16 = call { i16, i1 } 
@llvm.ssub.with.overflow.i16(i16 undef, i16 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1874 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7332 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 170 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 530 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1060 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1098 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7316 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28968 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2088 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; V8M-SIZE-LABEL: 'ssub' @@ -440,21 +440,21 @@ define i32 @ssub(i32 %arg) { ; ; MVE-SIZE-LABEL: 'ssub' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 150 for 
instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 787 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 107 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 211 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.ssub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I16 = call { i16, i1 } @llvm.ssub.with.overflow.i16(i16 undef, i16 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.ssub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 779 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3092 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 203 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.ssub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 404 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.ssub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %I8 = call { i8, i1 } @llvm.ssub.with.overflow.i8(i8 undef, i8 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.ssub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3084 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12310 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 396 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.ssub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 790 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.ssub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 %I64 = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
@@ -541,21 +541,21 @@ define i32 @usub(i32 %arg) {
 ;
 ; MVE-RECIP-LABEL: 'usub'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1040 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1032 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4112 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'usub'
@@ -598,21 +598,21 @@ define i32 @usub(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'usub'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 145 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.usub.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 undef, i16 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.usub.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1032 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.usub.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.usub.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call { i8, i1 } @llvm.usub.with.overflow.i8(i8 undef, i8 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.usub.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1028 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4104 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 132 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.usub.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.usub.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 %I64 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
@@ -699,21 +699,21 @@ define i32 @smul(i32 %arg) {
 ;
 ; MVE-RECIP-LABEL: 'smul'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 768 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1056 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2112 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 380 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1464 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 348 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1016 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 420 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1512 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 228 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 616 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1252 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4712 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 356 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 872 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'smul'
@@ -756,21 +756,21 @@ define i32 @smul(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'smul'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 85 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 165 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 325 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 218 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 816 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 79 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 186 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 368 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.smul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I16 = call { i16, i1 } @llvm.smul.with.overflow.i16(i16 undef, i16 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.smul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 332 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1174 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.smul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 278 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.smul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I8 = call { i8, i1 } @llvm.smul.with.overflow.i8(i8 undef, i8 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.smul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1164 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4374 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 268 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.smul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 534 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.smul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 %I64 = call {i64, i1} @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
@@ -857,21 +857,21 @@ define i32 @umul(i32 %arg) {
 ;
 ; MVE-RECIP-LABEL: 'umul'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 248 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 624 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 492 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 984 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1968 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 84 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1456 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 116 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 344 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1008 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1504 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 608 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1248 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4704 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 864 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; V8M-SIZE-LABEL: 'umul'
@@ -914,21 +914,21 @@ define i32 @umul(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'umul'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 93 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 181 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 812 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 364 for instruction: %V16I32 = call { <16 x i32>, <16 x i1> } @llvm.umul.with.overflow.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I16 = call { i16, i1 } @llvm.umul.with.overflow.i16(i16 undef, i16 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V8I16 = call { <8 x i16>, <8 x i1> } @llvm.umul.with.overflow.v8i16(<8 x i16> undef, <8 x i16> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1170 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 138 for instruction: %V16I16 = call { <16 x i16>, <16 x i1> } @llvm.umul.with.overflow.v16i16(<16 x i16> undef, <16 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 274 for instruction: %V32I16 = call { <32 x i16>, <32 x i1> } @llvm.umul.with.overflow.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I8 = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 undef, i8 undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V16I8 = call { <16 x i8>, <16 x i1> } @llvm.umul.with.overflow.v16i8(<16 x i8> undef, <16 x i8> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1162 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4370 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 266 for instruction: %V32I8 = call { <32 x i8>, <32 x i1> } @llvm.umul.with.overflow.v32i8(<32 x i8> undef, <32 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 530 for instruction: %V64I8 = call { <64 x i8>, <64 x i1> } @llvm.umul.with.overflow.v64i8(<64 x i8> undef, <64 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret i32 undef
 ;
 %I64 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
index 251392af24c7..2aefb3a8fd59 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-ssat.ll
@@ -85,22 +85,22 @@ define i32 @add(i32 %arg) {
 ;
 ; MVE-RECIP-LABEL: 'add'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1330 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 302 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 602 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1202 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -160,22 +160,22 @@ define i32 @add(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'add'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2I32 = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.sadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %I16 = call i16 @llvm.sadd.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2I16 = call <2 x i16> @llvm.sadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.sadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %I8 = call i8 @llvm.sadd.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2I8 = call <2 x i8> @llvm.sadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.sadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.sadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -291,22 +291,22 @@ define i32 @sub(i32 %arg) {
 ;
 ; MVE-RECIP-LABEL: 'sub'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 118 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 378 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1330 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 302 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 602 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1202 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 180 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -366,22 +366,22 @@ define i32 @sub(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'sub'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 81 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 153 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2I32 = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.ssub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %I16 = call i16 @llvm.ssub.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2I16 = call <2 x i16> @llvm.ssub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.ssub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %I8 = call i8 @llvm.ssub.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V2I8 = call <2 x i8> @llvm.ssub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.ssub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.ssub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
index 342956cb83e8..9baec1e7d25f 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith-usat.ll
@@ -85,22 +85,22 @@ define i32 @add(i32 %arg) {
 ;
 ; MVE-RECIP-LABEL: 'add'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -160,22 +160,22 @@ define i32 @add(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'add'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.uadd.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.uadd.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I16 = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.uadd.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.uadd.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I8 = call <2 x i8> @llvm.uadd.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.uadd.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.uadd.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.uadd.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -291,22 +291,22 @@ define i32 @sub(i32 %arg) {
 ;
 ; MVE-RECIP-LABEL: 'sub'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -366,22 +366,22 @@ define i32 @sub(i32 %arg) {
 ;
 ; MVE-SIZE-LABEL: 'sub'
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.usub.sat.i16(i16 undef, i16 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I16 = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I16 = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.usub.sat.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.usub.sat.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.usub.sat.i8(i8 undef, i8 undef)
-; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I8 = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4I8 = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I8 = call <8 x i8> @llvm.usub.sat.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.usub.sat.v16i8(<16 x i8> undef, <16 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/arith.ll b/llvm/test/Analysis/CostModel/ARM/arith.ll
index cbeaa8e97f06..f1f11e2484ac 100644
--- a/llvm/test/Analysis/CostModel/ARM/arith.ll
+++ b/llvm/test/Analysis/CostModel/ARM/arith.ll
@@ -375,12 +375,12 @@ define void @i64() {
 
 define void @vi8() {
 ; CHECK-MVE1-LABEL: 'vi8'
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h2 = shl <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i8> undef, undef
+; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i8> undef, undef
 ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i8> undef, undef
 ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i8> undef, undef
 ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i8> undef, undef
@@ -414,12 +414,12 @@ define void @vi8() {
 ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE2-LABEL: 'vi8'
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %h2 = shl <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i8> undef, undef
+; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i8> undef, undef
 ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i8> undef, undef
 ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i8> undef, undef
 ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i8> undef, undef
@@ -609,12 +609,12 @@ define void @vi8() {
 ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-SIZE-LABEL: 'vi8'
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = add <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = sub <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = mul <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f2 = ashr <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %g2 = lshr <2 x i8> undef, undef
-; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %h2 = shl <2 x i8> undef, undef
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i8> undef, undef
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i8> undef, undef
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i8> undef, undef
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i8> undef, undef
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i8> undef, undef
+; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i8> undef, undef
 ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i8> undef, undef
 ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i8> undef, undef
 ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i8> undef, undef
@@ -688,12 +688,12 @@ define void @vi8() {
 
 define void @vi16() {
 ; CHECK-MVE1-LABEL: 'vi16'
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = add <2 x i16> undef, undef
-; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = sub <2 x i16>
undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = mul <2 x i16> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f2 = ashr <2 x i16> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %g2 = lshr <2 x i16> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %h2 = shl <2 x i16> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i16> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i16> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i16> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i16> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i16> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i16> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i16> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i16> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i16> undef, undef @@ -922,12 +922,12 @@ define void @vi16() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-MVE-SIZE-LABEL: 'vi16' -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = add <2 x i16> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = sub <2 x i16> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = mul <2 x i16> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f2 = ashr <2 x i16> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %g2 = lshr <2 x i16> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %h2 = shl <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i16> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i16> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i16> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i16> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i16> undef, undef @@ -1001,12 +1001,12 @@ define void @vi16() { define void @vi32() { ; CHECK-MVE1-LABEL: 'vi32' -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = add <2 x i32> undef, 
undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = sub <2 x i32> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = mul <2 x i32> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f2 = ashr <2 x i32> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %g2 = lshr <2 x i32> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %h2 = shl <2 x i32> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i32> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i32> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i32> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i32> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i32> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i32> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i32> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i32> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i32> undef, undef @@ -1040,12 +1040,12 @@ define void @vi32() { ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-MVE2-LABEL: 'vi32' -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = add <2 x i32> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = sub <2 x i32> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = mul <2 x i32> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f2 = ashr <2 x i32> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %g2 = lshr <2 x i32> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %h2 = shl <2 x i32> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i32> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i32> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i32> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i32> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i32> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i32> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i32> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i32> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i32> undef, undef @@ -1235,12 +1235,12 @@ define void @vi32() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-MVE-SIZE-LABEL: 
'vi32' -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = add <2 x i32> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = sub <2 x i32> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = mul <2 x i32> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %f2 = ashr <2 x i32> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %g2 = lshr <2 x i32> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %h2 = shl <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %c2 = add <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %d2 = sub <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %e2 = mul <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %f2 = ashr <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %g2 = lshr <2 x i32> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %h2 = shl <2 x i32> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i32> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i32> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i32> undef, undef @@ -1314,21 +1314,21 @@ define void @vi32() { define void @vi64() { ; CHECK-MVE1-LABEL: 'vi64' -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c2 = add <2 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d2 = sub <2 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e2 = mul <2 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f2 = ashr <2 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g2 = lshr <2 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h2 = shl <2 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c2 = add <2 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d2 = sub <2 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e2 = mul <2 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = ashr <2 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %g2 = lshr <2 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %h2 = shl <2 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated 
cost of 24 for instruction: %c4 = add <4 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %d4 = sub <4 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %e4 = mul <4 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f4 = ashr <4 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %g4 = lshr <4 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %h4 = shl <4 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %c4 = add <4 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %d4 = sub <4 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = mul <4 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f4 = ashr <4 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %g4 = lshr <4 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %h4 = shl <4 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = and <4 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j4 = or <4 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k4 = xor <4 x i64> undef, undef @@ -1341,33 +1341,33 @@ define void @vi64() { ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i8 = and <8 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j8 = or <8 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k8 = xor <8 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %c16 = add <16 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %d16 = sub <16 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %e16 = mul <16 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %f16 = ashr <16 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %g16 = lshr <16 x i64> undef, undef -; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %h16 = shl <16 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %c16 = add <16 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %d16 = sub <16 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %e16 = mul <16 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %f16 = ashr <16 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %g16 = lshr <16 x i64> undef, undef +; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %h16 = shl <16 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i16 = and <16 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 
8 for instruction: %j16 = or <16 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k16 = xor <16 x i64> undef, undef ; CHECK-MVE1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-MVE2-LABEL: 'vi64' -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c2 = add <2 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d2 = sub <2 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e2 = mul <2 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f2 = ashr <2 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g2 = lshr <2 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h2 = shl <2 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c2 = add <2 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d2 = sub <2 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e2 = mul <2 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = ashr <2 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %g2 = lshr <2 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %h2 = shl <2 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = and <2 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j2 = or <2 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k2 = xor <2 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %c4 = add <4 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %d4 = sub <4 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %e4 = mul <4 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f4 = ashr <4 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %g4 = lshr <4 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %h4 = shl <4 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %c4 = add <4 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %d4 = sub <4 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = mul <4 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f4 = ashr <4 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %g4 = lshr <4 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %h4 = shl <4 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i4 = and <4 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j4 = or <4 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found 
an estimated cost of 4 for instruction: %k4 = xor <4 x i64> undef, undef @@ -1380,24 +1380,24 @@ define void @vi64() { ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i8 = and <8 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j8 = or <8 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k8 = xor <8 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %c16 = add <16 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %d16 = sub <16 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %e16 = mul <16 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %f16 = ashr <16 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %g16 = lshr <16 x i64> undef, undef -; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %h16 = shl <16 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %c16 = add <16 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %d16 = sub <16 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %e16 = mul <16 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %f16 = ashr <16 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %g16 = lshr <16 x i64> undef, undef +; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %h16 = shl <16 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i16 = and <16 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j16 = or <16 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k16 = xor <16 x i64> undef, undef ; CHECK-MVE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-MVE4-LABEL: 'vi64' -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %c2 = add <2 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %d2 = sub <2 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %e2 = mul <2 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %f2 = ashr <2 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %g2 = lshr <2 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %h2 = shl <2 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c2 = add <2 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d2 = sub <2 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e2 = mul <2 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = ashr <2 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %g2 = lshr <2 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: 
Found an estimated cost of 20 for instruction: %h2 = shl <2 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i2 = and <2 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j2 = or <2 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k2 = xor <2 x i64> undef, undef @@ -1410,21 +1410,21 @@ define void @vi64() { ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i4 = and <4 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j4 = or <4 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k4 = xor <4 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %c8 = add <8 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %d8 = sub <8 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %e8 = mul <8 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %f8 = ashr <8 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %g8 = lshr <8 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %h8 = shl <8 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %c8 = add <8 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %d8 = sub <8 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e8 = mul <8 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %f8 = ashr <8 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %g8 = lshr <8 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %h8 = shl <8 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %i8 = and <8 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %j8 = or <8 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %k8 = xor <8 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %c16 = add <16 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %d16 = sub <16 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %e16 = mul <16 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %f16 = ashr <16 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %g16 = lshr <16 x i64> undef, undef -; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %h16 = shl <16 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %c16 = add <16 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %d16 = sub <16 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %e16 = mul <16 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an 
estimated cost of 160 for instruction: %f16 = ashr <16 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %g16 = lshr <16 x i64> undef, undef +; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %h16 = shl <16 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %i16 = and <16 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %j16 = or <16 x i64> undef, undef ; CHECK-MVE4-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %k16 = xor <16 x i64> undef, undef @@ -1548,21 +1548,21 @@ define void @vi64() { ; CHECK-V8R-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-MVE-SIZE-LABEL: 'vi64' -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c2 = add <2 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d2 = sub <2 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e2 = mul <2 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %f2 = ashr <2 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %g2 = lshr <2 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %h2 = shl <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c2 = add <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d2 = sub <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e2 = mul <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = ashr <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %g2 = lshr <2 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %h2 = shl <2 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = and <2 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %j2 = or <2 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %k2 = xor <2 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %c4 = add <4 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %d4 = sub <4 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %e4 = mul <4 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f4 = ashr <4 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %g4 = lshr <4 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %h4 = shl <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %c4 = add <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %d4 = sub <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: 
%e4 = mul <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f4 = ashr <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %g4 = lshr <4 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %h4 = shl <4 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = and <4 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %j4 = or <4 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %k4 = xor <4 x i64> undef, undef @@ -1575,12 +1575,12 @@ define void @vi64() { ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %i8 = and <8 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %j8 = or <8 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %k8 = xor <8 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %c16 = add <16 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %d16 = sub <16 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %e16 = mul <16 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %f16 = ashr <16 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %g16 = lshr <16 x i64> undef, undef -; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %h16 = shl <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %c16 = add <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %d16 = sub <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %e16 = mul <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %f16 = ashr <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %g16 = lshr <16 x i64> undef, undef +; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %h16 = shl <16 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %i16 = and <16 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %j16 = or <16 x i64> undef, undef ; CHECK-MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %k16 = xor <16 x i64> undef, undef diff --git a/llvm/test/Analysis/CostModel/ARM/cast.ll b/llvm/test/Analysis/CostModel/ARM/cast.ll index f8377ad3f0c1..bf2488eec97b 100644 --- a/llvm/test/Analysis/CostModel/ARM/cast.ll +++ b/llvm/test/Analysis/CostModel/ARM/cast.ll @@ -557,36 +557,36 @@ define i32 @casts() { ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r87hf = fpext <4 x half> undef to <4 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r88hf = fpext <8 x half> undef to <8 x float> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %r89hf = fpext <16 x half> undef to <16 x float> 
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r90h = fptoui <2 x half> undef to <2 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r91h = fptosi <2 x half> undef to <2 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r92h = fptoui <2 x half> undef to <2 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r93h = fptosi <2 x half> undef to <2 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r94h = fptoui <2 x half> undef to <2 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r95h = fptosi <2 x half> undef to <2 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r96h = fptoui <2 x half> undef to <2 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r97h = fptosi <2 x half> undef to <2 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r98h = fptoui <2 x half> undef to <2 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r99h = fptosi <2 x half> undef to <2 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 
for instruction: %r107 = fptosi <2 x double> undef to <2 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r90 = fptoui <2 x float> undef to <2 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r91 = fptosi <2 x float> undef to <2 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r92 = fptoui <2 x float> undef to <2 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r93 = fptosi <2 x float> undef to <2 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r94 = fptoui <2 x float> undef to <2 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r95 = fptosi <2 x float> undef to <2 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r96 = fptoui <2 x float> undef to <2 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r97 = fptosi <2 x float> undef to <2 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %r98 = fptoui <2 x float> undef to <2 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %r99 = fptosi <2 x float> undef to <2 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r90h = fptoui <2 x half> undef to <2 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r91h = fptosi <2 x half> undef to <2 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r92h = fptoui <2 x half> undef to <2 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r93h = fptosi <2 x half> undef to <2 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r94h = fptoui <2 x half> undef to <2 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r95h = fptosi <2 x half> undef to <2 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r96h = fptoui <2 x half> undef to <2 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r97h = fptosi <2 x half> undef to <2 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %r98h = fptoui <2 x half> undef to <2 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %r99h = fptosi <2 x half> undef to <2 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r100 = fptoui <2 x double> undef to <2 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r101 = fptosi <2 x double> undef to <2 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r102 = fptoui <2 x double> undef to <2 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r103 = fptosi <2 x double> undef to <2 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r104 = fptoui <2 x double> undef to <2 x i16> +; 
CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r105 = fptosi <2 x double> undef to <2 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r106 = fptoui <2 x double> undef to <2 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %r107 = fptosi <2 x double> undef to <2 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %r108 = fptoui <2 x double> undef to <2 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %r109 = fptosi <2 x double> undef to <2 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r110 = fptoui <4 x float> undef to <4 x i1> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r111 = fptosi <4 x float> undef to <4 x i1> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r112 = fptoui <4 x float> undef to <4 x i8> @@ -595,8 +595,8 @@ define i32 @casts() { ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r115 = fptosi <4 x float> undef to <4 x i16> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r116 = fptoui <4 x float> undef to <4 x i32> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r117 = fptosi <4 x float> undef to <4 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 274 for instruction: %r118 = fptoui <4 x float> undef to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 274 for instruction: %r119 = fptosi <4 x float> undef to <4 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r110h = fptoui <4 x half> undef to <4 x i1> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r111h = fptosi <4 x half> undef to <4 x i1> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r112h = fptoui <4 x half> undef to <4 x i8> @@ -605,18 +605,18 @@ define i32 @casts() { ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r115h = fptosi <4 x half> undef to <4 x i16> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r116h = fptoui <4 x half> undef to <4 x i32> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r117h = fptosi <4 x half> undef to <4 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r118h = fptoui <4 x half> undef to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r119h = fptosi <4 x half> undef to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r123 = fptosi <4 x double> 
undef to <4 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 274 for instruction: %r118h = fptoui <4 x half> undef to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 274 for instruction: %r119h = fptosi <4 x half> undef to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %r120 = fptoui <4 x double> undef to <4 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %r121 = fptosi <4 x double> undef to <4 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %r122 = fptoui <4 x double> undef to <4 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %r123 = fptosi <4 x double> undef to <4 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %r124 = fptoui <4 x double> undef to <4 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %r125 = fptosi <4 x double> undef to <4 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %r126 = fptoui <4 x double> undef to <4 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %r127 = fptosi <4 x double> undef to <4 x i32> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %r128 = fptoui <4 x double> undef to <4 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %r129 = fptosi <4 x double> undef to <4 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r130 = fptoui <8 x float> undef to <8 x i1> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r131 = fptosi <8 x float> undef to <8 x i1> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r132 = fptoui <8 x float> undef to <8 x i8> @@ -625,8 +625,8 @@ define i32 @casts() { ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r135 = fptosi <8 x float> undef to <8 x i16> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r136 = fptoui <8 x float> undef to <8 x i32> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r137 = fptosi <8 x float> undef to <8 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %r138 = fptoui <8 x float> undef to <8 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1096 for instruction: %r138 = 
fptoui <8 x float> undef to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1096 for instruction: %r139 = fptosi <8 x float> undef to <8 x i64> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r130h = fptoui <8 x half> undef to <8 x i1> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r131h = fptosi <8 x half> undef to <8 x i1> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r132h = fptoui <8 x half> undef to <8 x i8> @@ -635,18 +635,18 @@ define i32 @casts() { ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r135h = fptosi <8 x half> undef to <8 x i16> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r136h = fptoui <8 x half> undef to <8 x i32> ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r137h = fptosi <8 x half> undef to <8 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r138h = fptoui <8 x half> undef to <8 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r139h = fptosi <8 x half> undef to <8 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %r145 = fptosi <8 x double> undef to <8 x i16> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 328 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64> -; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1098 for instruction: %r138h = fptoui <8 x half> undef to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1098 for instruction: %r139h = fptosi <8 x half> undef to <8 x i64> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 586 for instruction: %r140 = fptoui <8 x double> undef to <8 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 586 for instruction: %r141 = fptosi <8 x double> undef to <8 x i1> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 586 for instruction: %r142 = fptoui <8 x double> undef to <8 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 586 for instruction: %r143 = fptosi <8 x double> undef to <8 x i8> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 586 for instruction: %r144 = fptoui <8 x double> undef to <8 x i16> +; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 586 for 
instruction: %r145 = fptosi <8 x double> undef to <8 x i16>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 584 for instruction: %r146 = fptoui <8 x double> undef to <8 x i32>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 584 for instruction: %r147 = fptosi <8 x double> undef to <8 x i32>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1088 for instruction: %r148 = fptoui <8 x double> undef to <8 x i64>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1088 for instruction: %r149 = fptosi <8 x double> undef to <8 x i64>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r150 = fptoui <16 x float> undef to <16 x i1>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r151 = fptosi <16 x float> undef to <16 x i1>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r152 = fptoui <16 x float> undef to <16 x i8>
@@ -655,8 +655,8 @@ define i32 @casts() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r155 = fptosi <16 x float> undef to <16 x i16>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r156 = fptoui <16 x float> undef to <16 x i32>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r157 = fptosi <16 x float> undef to <16 x i32>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1312 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1312 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4384 for instruction: %r158 = fptoui <16 x float> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4384 for instruction: %r159 = fptosi <16 x float> undef to <16 x i64>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r150h = fptoui <16 x half> undef to <16 x i1>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r151h = fptosi <16 x half> undef to <16 x i1>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r152h = fptoui <16 x half> undef to <16 x i8>
@@ -665,18 +665,18 @@ define i32 @casts() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r155h = fptosi <16 x half> undef to <16 x i16>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r156h = fptoui <16 x half> undef to <16 x i32>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r157h = fptosi <16 x half> undef to <16 x i32>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %r158h = fptoui <16 x half> undef to <16 x i64>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %r159h = fptosi <16 x half> undef to <16 x i64>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1312 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1312 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1280 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4392 for instruction: %r158h = fptoui <16 x half> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4392 for instruction: %r159h = fptosi <16 x half> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2346 for instruction: %r160 = fptoui <16 x double> undef to <16 x i1>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2346 for instruction: %r161 = fptosi <16 x double> undef to <16 x i1>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2346 for instruction: %r162 = fptoui <16 x double> undef to <16 x i8>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2346 for instruction: %r163 = fptosi <16 x double> undef to <16 x i8>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2344 for instruction: %r164 = fptoui <16 x double> undef to <16 x i16>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2344 for instruction: %r165 = fptosi <16 x double> undef to <16 x i16>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2336 for instruction: %r166 = fptoui <16 x double> undef to <16 x i32>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2336 for instruction: %r167 = fptosi <16 x double> undef to <16 x i32>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4352 for instruction: %r168 = fptoui <16 x double> undef to <16 x i64>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4352 for instruction: %r169 = fptosi <16 x double> undef to <16 x i64>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r170 = uitofp <2 x i1> undef to <2 x float>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r171 = sitofp <2 x i1> undef to <2 x float>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r172 = uitofp <2 x i8> undef to <2 x float>
@@ -697,16 +697,16 @@ define i32 @casts() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r177h = sitofp <2 x i32> undef to <2 x half>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r178h = uitofp <2 x i64> undef to <2 x half>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r179h = sitofp <2 x i64> undef to <2 x half>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r180 = uitofp <2 x i1> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r181 = sitofp <2 x i1> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r182 = uitofp <2 x i8> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r183 = sitofp <2 x i8> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r184 = uitofp <2 x i16> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r185 = sitofp <2 x i16> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r186 = uitofp <2 x i32> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r187 = sitofp <2 x i32> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r188 = uitofp <2 x i64> undef to <2 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %r189 = sitofp <2 x i64> undef to <2 x double>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r190 = uitofp <4 x i1> undef to <4 x float>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r191 = sitofp <4 x i1> undef to <4 x float>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r192 = uitofp <4 x i8> undef to <4 x float>
@@ -727,16 +727,16 @@ define i32 @casts() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r197h = sitofp <4 x i32> undef to <4 x half>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r198h = uitofp <4 x i64> undef to <4 x half>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r199h = sitofp <4 x i64> undef to <4 x half>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %r200 = uitofp <4 x i1> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %r201 = sitofp <4 x i1> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %r202 = uitofp <4 x i8> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %r203 = sitofp <4 x i8> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %r204 = uitofp <4 x i16> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %r205 = sitofp <4 x i16> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %r206 = uitofp <4 x i32> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %r207 = sitofp <4 x i32> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %r208 = uitofp <4 x i64> undef to <4 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %r209 = sitofp <4 x i64> undef to <4 x double>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r210 = uitofp <8 x i1> undef to <8 x float>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r211 = sitofp <8 x i1> undef to <8 x float>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r212 = uitofp <8 x i8> undef to <8 x float>
@@ -757,16 +757,16 @@ define i32 @casts() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r217h = sitofp <8 x i32> undef to <8 x half>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r218h = uitofp <8 x i64> undef to <8 x half>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r219h = sitofp <8 x i64> undef to <8 x half>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 522 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 512 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 394 for instruction: %r220 = uitofp <8 x i1> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 394 for instruction: %r221 = sitofp <8 x i1> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 394 for instruction: %r222 = uitofp <8 x i8> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 394 for instruction: %r223 = sitofp <8 x i8> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 394 for instruction: %r224 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 394 for instruction: %r225 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 394 for instruction: %r226 = uitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 394 for instruction: %r227 = sitofp <8 x i16> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %r228 = uitofp <8 x i64> undef to <8 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 384 for instruction: %r229 = sitofp <8 x i64> undef to <8 x double>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r230 = uitofp <16 x i1> undef to <16 x float>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r231 = sitofp <16 x i1> undef to <16 x float>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %r232 = uitofp <16 x i8> undef to <16 x float>
@@ -787,16 +787,16 @@ define i32 @casts() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %r237h = sitofp <16 x i32> undef to <16 x half>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %r238h = uitofp <16 x i64> undef to <16 x half>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 168 for instruction: %r239h = sitofp <16 x i64> undef to <16 x half>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2090 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2090 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2090 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2090 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2088 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2088 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2088 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2088 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2048 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2048 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %r240 = uitofp <16 x i1> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %r241 = sitofp <16 x i1> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %r242 = uitofp <16 x i8> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1578 for instruction: %r243 = sitofp <16 x i8> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %r244 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %r245 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %r246 = uitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1576 for instruction: %r247 = sitofp <16 x i16> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %r248 = uitofp <16 x i64> undef to <16 x double>
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1536 for instruction: %r249 = sitofp <16 x i64> undef to <16 x double>
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-MAIN-RECIP-LABEL: 'casts'
diff --git a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
index 4406e27acaba..afa616260b3e 100644
--- a/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cast_ldst.ll
@@ -81,14 +81,14 @@ define i32 @load_extends() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi8 = load i8, i8* undef, align 1
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi16 = load i16, i16* undef, align 2
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadi32 = load i32, i32* undef, align 4
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %loadv2i8 = load <2 x i8>, <2 x i8>* undef, align 2
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4i8 = load <4 x i8>, <4 x i8>* undef, align 4
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv8i8 = load <8 x i8>, <8 x i8>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv16i8 = load <16 x i8>, <16 x i8>* undef, align 16
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %loadv2i16 = load <2 x i16>, <2 x i16>* undef, align 4
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4i16 = load <4 x i16>, <4 x i16>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv8i16 = load <8 x i16>, <8 x i16>* undef, align 16
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %loadv2i32 = load <2 x i32>, <2 x i32>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4i32 = load <4 x i32>, <4 x i32>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r0 = sext i8 %loadi8 to i16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %r1 = zext i8 %loadi8 to i16
@@ -774,9 +774,9 @@ define i32 @store_trunc() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %i1632, i16* undef, align 2
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i16 %i1664, i16* undef, align 2
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store i32 %i3264, i32* undef, align 4
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i8> %v2816, <2 x i8>* undef, align 2
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i8> %v2832, <2 x i8>* undef, align 2
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i8> %v2864, <2 x i8>* undef, align 2
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i8> %v2816, <2 x i8>* undef, align 2
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i8> %v2832, <2 x i8>* undef, align 2
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i8> %v2864, <2 x i8>* undef, align 2
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i8> %v4816, <4 x i8>* undef, align 4
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i8> %v4832, <4 x i8>* undef, align 4
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i8> %v4864, <4 x i8>* undef, align 4
@@ -786,13 +786,13 @@ define i32 @store_trunc() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16816, <16 x i8>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16832, <16 x i8>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> %v16864, <16 x i8>* undef, align 16
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i16> %v21632, <2 x i16>* undef, align 4
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i16> %v21664, <2 x i16>* undef, align 4
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i16> %v21632, <2 x i16>* undef, align 4
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i16> %v21664, <2 x i16>* undef, align 4
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i16> %v41632, <4 x i16>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i16> %v41664, <4 x i16>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v81632, <8 x i16>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> %v81664, <8 x i16>* undef, align 16
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i32> %v23264, <2 x i32>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i32> %v23264, <2 x i32>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> %v43264, <4 x i32>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
@@ -1273,11 +1273,11 @@ define i32 @load_fpextends() {
 ; CHECK-MVE-RECIP-LABEL: 'load_fpextends'
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadf16 = load half, half* undef, align 2
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %loadf32 = load float, float* undef, align 4
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %loadv2f16 = load <2 x half>, <2 x half>* undef, align 4
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %loadv4f16 = load <4 x half>, <4 x half>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %loadv2f16 = load <2 x half>, <2 x half>* undef, align 4
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %loadv4f16 = load <4 x half>, <4 x half>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv8f16 = load <8 x half>, <8 x half>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %loadv16f16 = load <16 x half>, <16 x half>* undef, align 32
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %loadv2f32 = load <2 x float>, <2 x float>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %loadv2f32 = load <2 x float>, <2 x float>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %loadv4f32 = load <4 x float>, <4 x float>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %loadv8f32 = load <8 x float>, <8 x float>* undef, align 32
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r1 = fpext half %loadf16 to float
@@ -1567,13 +1567,13 @@ define i32 @load_fptrunc() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store half %i1632, half* undef, align 2
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store half %i1664, half* undef, align 2
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float %i3264, float* undef, align 4
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x half> %v21632, <2 x half>* undef, align 4
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x half> %v21664, <2 x half>* undef, align 4
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <2 x half> %v21632, <2 x half>* undef, align 4
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <2 x half> %v21664, <2 x half>* undef, align 4
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x half> %v41632, <4 x half>* undef, align 8
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: store <4 x half> %v41664, <4 x half>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <4 x half> %v41664, <4 x half>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x half> %v81632, <8 x half>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x half> %v81664, <8 x half>* undef, align 16
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x float> %v23264, <2 x float>* undef, align 8
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <2 x float> %v23264, <2 x float>* undef, align 8
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x float> %v43264, <4 x float>* undef, align 16
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
diff --git a/llvm/test/Analysis/CostModel/ARM/cmps.ll b/llvm/test/Analysis/CostModel/ARM/cmps.ll
index 75f2bd58b212..bda8763ef18d 100644
--- a/llvm/test/Analysis/CostModel/ARM/cmps.ll
+++ b/llvm/test/Analysis/CostModel/ARM/cmps.ll
@@ -22,9 +22,9 @@ define i32 @cmps() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a9 = fcmp ogt double undef, undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a10 = fcmp olt <8 x half> undef, undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a11 = fcmp oge <4 x float> undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %a12 = fcmp oge <2 x double> undef, undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a12 = fcmp oge <2 x double> undef, undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %p = icmp eq i32* undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %q = icmp eq <4 x i32*> undef, undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %q = icmp eq <4 x i32*> undef, undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; CHECK-V8M-MAIN-RECIP-LABEL: 'cmps'
@@ -183,8 +183,8 @@ define void @minmax() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s4 = select <4 x i1> %c4, <4 x i32> undef, <4 x i32> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %c5 = icmp slt i32* undef, undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %s5 = select i1 %c5, i32* undef, i32* undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %c6 = icmp slt <4 x i32*> undef, undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %s6 = select <4 x i1> %c6, <4 x i32*> undef, <4 x i32*> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %c6 = icmp slt <4 x i32*> undef, undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %s6 = select <4 x i1> %c6, <4 x i32*> undef, <4 x i32*> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-RECIP-LABEL: 'minmax'
diff --git a/llvm/test/Analysis/CostModel/ARM/divrem.ll b/llvm/test/Analysis/CostModel/ARM/divrem.ll
index de5ec2034a35..0d9fae25cbb3 100644
--- a/llvm/test/Analysis/CostModel/ARM/divrem.ll
+++ b/llvm/test/Analysis/CostModel/ARM/divrem.ll
@@ -423,22 +423,22 @@ define void @vi8() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vi8'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t1 = sdiv <2 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t2 = udiv <2 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t3 = srem <2 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t4 = urem <2 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t1 = sdiv <2 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t2 = udiv <2 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t3 = srem <2 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t4 = urem <2 x i8> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f1 = sdiv <4 x i8> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = udiv <4 x i8> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f3 = srem <4 x i8> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f4 = urem <4 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e1 = sdiv <8 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e2 = udiv <8 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e3 = srem <8 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e4 = urem <8 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s1 = sdiv <16 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s2 = udiv <16 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s3 = srem <16 x i8> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s4 = urem <16 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e1 = sdiv <8 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e2 = udiv <8 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e3 = srem <8 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = urem <8 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s1 = sdiv <16 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s2 = udiv <16 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s3 = srem <16 x i8> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s4 = urem <16 x i8> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vi8'
@@ -538,22 +538,22 @@ define void @vi16() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vi16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t1 = sdiv <2 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t2 = udiv <2 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t3 = srem <2 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t4 = urem <2 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t1 = sdiv <2 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t2 = udiv <2 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t3 = srem <2 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t4 = urem <2 x i16> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f1 = sdiv <4 x i16> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = udiv <4 x i16> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f3 = srem <4 x i16> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f4 = urem <4 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e1 = sdiv <8 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e2 = udiv <8 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e3 = srem <8 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e4 = urem <8 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s1 = sdiv <16 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s2 = udiv <16 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s3 = srem <16 x i16> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s4 = urem <16 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e1 = sdiv <8 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e2 = udiv <8 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e3 = srem <8 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = urem <8 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s1 = sdiv <16 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s2 = udiv <16 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s3 = srem <16 x i16> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s4 = urem <16 x i16> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vi16'
@@ -653,22 +653,22 @@ define void @vi32() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vi32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t1 = sdiv <2 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t2 = udiv <2 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t3 = srem <2 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t4 = urem <2 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t1 = sdiv <2 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t2 = udiv <2 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t3 = srem <2 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t4 = urem <2 x i32> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f1 = sdiv <4 x i32> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = udiv <4 x i32> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f3 = srem <4 x i32> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f4 = urem <4 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e1 = sdiv <8 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e2 = udiv <8 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e3 = srem <8 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e4 = urem <8 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s1 = sdiv <16 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s2 = udiv <16 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s3 = srem <16 x i32> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s4 = urem <16 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e1 = sdiv <8 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e2 = udiv <8 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e3 = srem <8 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = urem <8 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s1 = sdiv <16 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s2 = udiv <16 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s3 = srem <16 x i32> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s4 = urem <16 x i32> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vi32'
@@ -768,22 +768,22 @@ define void @vi64() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vi64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t1 = sdiv <2 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t2 = udiv <2 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t3 = srem <2 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t4 = urem <2 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f1 = sdiv <4 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f2 = udiv <4 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f3 = srem <4 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f4 = urem <4 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %t1 = sdiv <2 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %t2 = udiv <2 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %t3 = srem <2 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %t4 = urem <2 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f1 = sdiv <4 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f2 = udiv <4 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %f3 = srem <4 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %f4 = urem <4 x i64> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e1 = sdiv <8 x i64> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e2 = udiv <8 x i64> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e3 = srem <8 x i64> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e4 = urem <8 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %s1 = sdiv <16 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %s2 = udiv <16 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s3 = srem <16 x i64> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s4 = urem <16 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %s1 = sdiv <16 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %s2 = udiv <16 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %s3 = srem <16 x i64> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %s4 = urem <16 x i64> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vi64'
@@ -873,12 +873,12 @@ define void @vf16() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vf16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %1 = fdiv <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %2 = frem <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %3 = fdiv <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %4 = frem <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %5 = fdiv <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %6 = frem <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = fdiv <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = frem <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = fdiv <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %4 = frem <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %5 = fdiv <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = frem <8 x half> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vf16'
@@ -928,12 +928,12 @@ define void @vf32() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vf32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %1 = fdiv <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %2 = frem <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %3 = fdiv <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %4 = frem <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %5 = fdiv <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %6 = frem <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = fdiv <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = frem <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = fdiv <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %4 = frem <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %5 = fdiv <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = frem <8 x float> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vf32'
@@ -983,12 +983,12 @@ define void @vf64() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vf64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %1 = fdiv <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %2 = frem <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %3 = fdiv <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %4 = frem <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %5 = fdiv <8 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %6 = frem <8 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = fdiv <2 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = frem <2 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = fdiv <4 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %4 = frem <4 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %5 = fdiv <8 x double> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = frem <8 x double> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vf64'
@@ -1048,22 +1048,22 @@ define void @vi8_2() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vi8_2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t1 = sdiv <2 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t2 = udiv <2 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t3 = srem <2 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t4 = urem <2 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t1 = sdiv <2 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t2 = udiv <2 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t3 = srem <2 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t4 = urem <2 x i8> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f1 = sdiv <4 x i8> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = udiv <4 x i8> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f3 = srem <4 x i8> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f4 = urem <4 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e1 = sdiv <8 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e2 = udiv <8 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e3 = srem <8 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e4 = urem <8 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s1 = sdiv <16 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s2 = udiv <16 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s3 = srem <16 x i8> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s4 = urem <16 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e1 = sdiv <8 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e2 = udiv <8 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e3 = srem <8 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = urem <8 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s1 = sdiv <16 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s2 = udiv <16 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s3 = srem <16 x i8> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s4 = urem <16 x i8> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vi8_2'
@@ -1163,22 +1163,22 @@ define void @vi16_2() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vi16_2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t1 = sdiv <2 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t2 = udiv <2 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t3 = srem <2 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t4 = urem <2 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t1 = sdiv <2 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t2 = udiv <2 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t3 = srem <2 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t4 = urem <2 x i16> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f1 = sdiv <4 x i16> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = udiv <4 x i16> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f3 = srem <4 x i16> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f4 = urem <4 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e1 = sdiv <8 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e2 = udiv <8 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e3 = srem <8 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e4 = urem <8 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s1 = sdiv <16 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s2 = udiv <16 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s3 = srem <16 x i16> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s4 = urem <16 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e1 = sdiv <8 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e2 = udiv <8 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e3 = srem <8 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = urem <8 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s1 = sdiv <16 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s2 = udiv <16 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s3 = srem <16 x i16> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s4 = urem <16 x i16> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vi16_2'
@@ -1278,22 +1278,22 @@ define void @vi32_2() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vi32_2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t1 = sdiv <2 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t2 = udiv <2 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t3 = srem <2 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t4 = urem <2 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t1 = sdiv <2 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t2 = udiv <2 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t3 = srem <2 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %t4 = urem <2 x i32> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f1 = sdiv <4 x i32> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f2 = udiv <4 x i32> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f3 = srem <4 x i32> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f4 = urem <4 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e1 = sdiv <8 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e2 = udiv <8 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e3 = srem <8 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e4 = urem <8 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s1 = sdiv <16 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s2 = udiv <16 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s3 = srem <16 x i32> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s4 = urem <16 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e1 = sdiv <8 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e2 = udiv <8 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e3 = srem <8 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %e4 = urem <8 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s1 = sdiv <16 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s2 = udiv <16 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s3 = srem <16 x i32> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %s4 = urem <16 x i32> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vi32_2'
@@ -1393,22 +1393,22 @@ define void @vi64_2() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vi64_2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t1 = sdiv <2 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %t2 = udiv <2 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t3 = srem <2 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %t4 = urem <2 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f1 = sdiv <4 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %f2 = udiv <4 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f3 = srem <4 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %f4 = urem <4 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %t1 = sdiv <2 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %t2 = udiv <2 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %t3 = srem <2 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %t4 = urem <2 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f1 = sdiv <4 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %f2 = udiv <4 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %f3 = srem <4 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %f4 = urem <4 x i64> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e1 = sdiv <8 x i64> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %e2 = udiv <8 x i64> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e3 = srem <8 x i64> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e4 = urem <8 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %s1 = sdiv <16 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %s2 = udiv <16 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s3 = srem <16 x i64> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %s4 = urem <16 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %s1 = sdiv <16 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %s2 = udiv <16 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %s3 = srem <16 x i64> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %s4 = urem <16 x i64> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vi64_2'
@@ -1498,12 +1498,12 @@ define void @vf16_2() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vf16_2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %1 = fdiv <2 x half> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %2 = frem <2 x half> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %3 = fdiv <4 x half> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %4 = frem <4 x half> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %5 = fdiv <8 x half> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %6 = frem <8 x half> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = fdiv <2 x half> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = frem <2 x half> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = fdiv <4 x half> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %4 = frem <4 x half> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %5 = fdiv <8 x half> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = frem <8 x half> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vf16_2'
@@ -1553,12 +1553,12 @@ define void @vf32_2() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vf32_2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %1 = fdiv <2 x float> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %2 = frem <2 x float> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %3 = fdiv <4 x float> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %4 = frem <4 x float> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %5 = fdiv <8 x float> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %6 = frem <8 x float> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = fdiv <2 x float> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = frem <2 x float> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = fdiv <4 x float> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %4 = frem <4 x float> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %5 = fdiv <8 x float> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = frem <8 x float> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vf32_2'
@@ -1608,12 +1608,12 @@ define void @vf64_2() {
 ; CHECK-NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVE-LABEL: 'vf64_2'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %1 = fdiv <2 x double> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %2 = frem <2 x double> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %3 = fdiv <4 x double> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %4 = frem <4 x double> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %5 = fdiv <8 x double> undef,
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %6 = frem <8 x double> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %1 = fdiv <2 x double> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = frem <2 x double> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = fdiv <4 x double> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %4 = frem <4 x double> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %5 = fdiv <8 x double> undef,
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %6 = frem <8 x double> undef,
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-V8M-MAIN-LABEL: 'vf64_2'
diff --git a/llvm/test/Analysis/CostModel/ARM/fparith.ll b/llvm/test/Analysis/CostModel/ARM/fparith.ll
index cb3d66edfa20..3403ab25e490 100644
--- a/llvm/test/Analysis/CostModel/ARM/fparith.ll
+++ b/llvm/test/Analysis/CostModel/ARM/fparith.ll
@@ -61,15 +61,15 @@ define void @f64() {
 
 define void @vf32() {
 ; CHECK-MVE-LABEL: 'vf32'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = fadd <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = fsub <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = fmul <2 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c4 = fadd <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d4 = fsub <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e4 = fmul <4 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %c8 = fadd <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %d8 = fsub <8 x float> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e8 = fmul <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x float> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x float> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'vf32'
@@ -98,15 +98,15 @@ define void @vf32() {
 
 define void @vf16() {
 ; CHECK-MVE-LABEL: 'vf16'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = fadd <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = fsub <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = fmul <2 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c4 = fadd <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d4 = fsub <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e4 = fmul <4 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %c8 = fadd <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %d8 = fsub <8 x half> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e8 = fmul <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x half> undef, undef
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x half> undef, undef
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-MVEFP-LABEL: 'vf16'
@@ -135,27 +135,27 @@ define void @vf16() {
 
 define void @vf64() {
 ; CHECK-MVE-LABEL: 'vf64'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = fadd <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = fsub <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = fmul <2 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c4 = fadd <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d4 = fsub <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e4 = fmul <4 x double> undef, undef
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %c8 = fadd <8 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %d8 = fsub <8 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e8 = fmul <8 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = fmul <8 x double> undef, undef ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-MVEFP-LABEL: 'vf64' -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %c2 = fadd <2 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %d2 = fsub <2 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %e2 = fmul <2 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %c4 = fadd <4 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %d4 = fsub <4 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %e4 = fmul <4 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %c8 = fadd <8 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %d8 = fsub <8 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %e8 = fmul <8 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %c2 = fadd <2 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %d2 = fsub <2 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %e2 = fmul <2 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %c4 = fadd <4 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %d4 = fsub <4 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %e4 = fmul <4 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %c8 = fadd <8 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %d8 = fsub <8 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %e8 = 
fmul <8 x double> undef, undef ; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %c2 = fadd <2 x double> undef, undef diff --git a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll index f64393cbe88c..b8a6a9569c9a 100644 --- a/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll +++ b/llvm/test/Analysis/CostModel/ARM/intrinsic-cost-kinds.ll @@ -95,7 +95,7 @@ define void @fmuladd(float %a, float %b, float %c, <16 x float> %va, <16 x float define void @log2(float %a, <16 x float> %va) { ; THRU-LABEL: 'log2' ; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a) -; THRU-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) +; THRU-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'log2' @@ -105,12 +105,12 @@ define void @log2(float %a, <16 x float> %va) { ; ; SIZE-LABEL: 'log2' ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.log2.f32(float %a) -; SIZE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) +; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'log2' ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.log2.f32(float %a) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 672 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %v = call <16 x float> @llvm.log2.v16f32(<16 x float> %va) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %s = call float @llvm.log2.f32(float %a) @@ -121,7 +121,7 @@ define void @log2(float %a, <16 x float> %va) { define void @constrained_fadd(float %a, <16 x float> %va) { ; THRU-LABEL: 'constrained_fadd' ; THRU-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; THRU-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") +; THRU-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'constrained_fadd' @@ -131,12 +131,12 @@ define void @constrained_fadd(float %a, <16 x float> %va) { ; ; SIZE-LABEL: 'constrained_fadd' ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; SIZE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %t = call <16 x 
float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") +; SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'constrained_fadd' ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %t = call <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float> %va, <16 x float> %va, metadata !"round.dynamic", metadata !"fpexcept.ignore") ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %s = call float @llvm.experimental.constrained.fadd.f32(float %a, float %a, metadata !"round.dynamic", metadata !"fpexcept.ignore") @@ -147,7 +147,7 @@ define void @constrained_fadd(float %a, <16 x float> %va) { define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) { ; THRU-LABEL: 'fmaximum' ; THRU-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; THRU-NEXT: Cost Model: Found an estimated cost of 928 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; THRU-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'fmaximum' @@ -157,12 +157,12 @@ define void @fmaximum(float %a, float %b, <16 x float> %va, <16 x float> %vb) { ; ; SIZE-LABEL: 'fmaximum' ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; SIZE-NEXT: Cost Model: Found an estimated cost of 784 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'fmaximum' ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %s = call float @llvm.maximum.f32(float %a, float %b) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 928 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %v = call <16 x float> @llvm.maximum.v16f32(<16 x float> %va, <16 x float> %vb) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %s = call float @llvm.maximum.f32(float %a, float %b) @@ -225,7 +225,7 @@ define void @ctlz(i32 %a, <16 x i32> %va) { define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) { ; THRU-LABEL: 
'fshl' ; THRU-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; THRU-NEXT: Cost Model: Found an estimated cost of 832 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'fshl' @@ -235,12 +235,12 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x ; ; SIZE-LABEL: 'fshl' ; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; SIZE-NEXT: Cost Model: Found an estimated cost of 805 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) +; SIZE-NEXT: Cost Model: Found an estimated cost of 229 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'fshl' ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 826 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 250 for instruction: %v = call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i32> %vc) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %s = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %c) @@ -250,7 +250,7 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x define void @maskedgather(<16 x float*> %va, <16 x i1> %vb, <16 x float> %vc) { ; THRU-LABEL: 'maskedgather' -; THRU-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'maskedgather' @@ -258,11 +258,11 @@ define void @maskedgather(<16 x float*> %va, <16 x i1> %vb, <16 x float> %vc) { ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'maskedgather' -; SIZE-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'maskedgather' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v = call <16 x float> 
@llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %va, i32 1, <16 x i1> %vb, <16 x float> %vc) @@ -271,7 +271,7 @@ define void @maskedgather(<16 x float*> %va, <16 x i1> %vb, <16 x float> %vc) { define void @maskedscatter(<16 x float> %va, <16 x float*> %vb, <16 x i1> %vc) { ; THRU-LABEL: 'maskedscatter' -; THRU-NEXT: Cost Model: Found an estimated cost of 576 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc) +; THRU-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'maskedscatter' @@ -279,11 +279,11 @@ define void @maskedscatter(<16 x float> %va, <16 x float*> %vb, <16 x i1> %vc) { ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'maskedscatter' -; SIZE-NEXT: Cost Model: Found an estimated cost of 576 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc) +; SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'maskedscatter' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 576 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc) +; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %va, <16 x float*> %vb, i32 1, <16 x i1> %vc) @@ -292,7 +292,7 @@ define void @maskedscatter(<16 x float> %va, <16 x float*> %vb, <16 x i1> %vc) { define void @reduce_fmax(<16 x float> %va) { ; THRU-LABEL: 'reduce_fmax' -; THRU-NEXT: Cost Model: Found an estimated cost of 696 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) +; THRU-NEXT: Cost Model: Found an estimated cost of 133 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) ; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; LATE-LABEL: 'reduce_fmax' @@ -300,11 +300,11 @@ define void @reduce_fmax(<16 x float> %va) { ; LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE-LABEL: 'reduce_fmax' -; SIZE-NEXT: Cost Model: Found an estimated cost of 685 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) +; SIZE-NEXT: Cost Model: Found an estimated cost of 122 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; ; SIZE_LATE-LABEL: 'reduce_fmax' -; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 694 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) +; SIZE_LATE-NEXT: Cost Model: Found 
an estimated cost of 131 for instruction: %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) ; SIZE_LATE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v = call float @llvm.vector.reduce.fmax.v16f32(<16 x float> %va) diff --git a/llvm/test/Analysis/CostModel/ARM/load_store.ll b/llvm/test/Analysis/CostModel/ARM/load_store.ll index 2ca4acda0fc2..52f6e3cccac6 100644 --- a/llvm/test/Analysis/CostModel/ARM/load_store.ll +++ b/llvm/test/Analysis/CostModel/ARM/load_store.ll @@ -69,16 +69,16 @@ define void @stores() { ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store i128 undef, i128* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store float undef, float* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: store double undef, double* undef, align 4 -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i8> undef, <2 x i8>* undef, align 1 -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i16> undef, <2 x i16>* undef, align 2 -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x i32> undef, <2 x i32>* undef, align 4 +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i8> undef, <2 x i8>* undef, align 1 +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i16> undef, <2 x i16>* undef, align 2 +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: store <2 x i32> undef, <2 x i32>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i64> undef, <2 x i64>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> undef, <4 x i32>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <8 x i16> undef, <8 x i16>* undef, align 2 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <16 x i8> undef, <16 x i8>* undef, align 1 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x float> undef, <4 x float>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: store <4 x double> undef, <4 x double>* undef, align 4 -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: store <2 x float> undef, <2 x float>* undef, align 4 +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: store <2 x float> undef, <2 x float>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x double> undef, <2 x double>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <2 x i64> undef, <2 x i64>* undef, align 1 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: store <4 x i32> undef, <4 x i32>* undef, align 1 @@ -256,16 +256,16 @@ define void @loads() { ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = load i128, i128* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, float* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, double* undef, align 4 -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %8 = load <2 x 
i8>, <2 x i8>* undef, align 1 -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %9 = load <2 x i16>, <2 x i16>* undef, align 2 -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %10 = load <2 x i32>, <2 x i32>* undef, align 4 +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %8 = load <2 x i8>, <2 x i8>* undef, align 1 +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %9 = load <2 x i16>, <2 x i16>* undef, align 2 +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %10 = load <2 x i32>, <2 x i32>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %11 = load <2 x i64>, <2 x i64>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = load <4 x i32>, <4 x i32>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %13 = load <8 x i16>, <8 x i16>* undef, align 2 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %14 = load <16 x i8>, <16 x i8>* undef, align 1 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %15 = load <4 x float>, <4 x float>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %16 = load <4 x double>, <4 x double>* undef, align 4 -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %17 = load <2 x float>, <2 x float>* undef, align 4 +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %17 = load <2 x float>, <2 x float>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <2 x double>, <2 x double>* undef, align 4 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = load <2 x i64>, <2 x i64>* undef, align 1 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %20 = load <4 x i32>, <4 x i32>* undef, align 1 diff --git a/llvm/test/Analysis/CostModel/ARM/mve-abs.ll b/llvm/test/Analysis/CostModel/ARM/mve-abs.ll index a69531c0379a..ace2d1bee070 100644 --- a/llvm/test/Analysis/CostModel/ARM/mve-abs.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-abs.ll @@ -31,22 +31,22 @@ declare <64 x i8> @llvm.abs.v64i8(<64 x i8>, i1) define i32 @abs(i32 %arg) { ; MVE-RECIP-LABEL: 'abs' ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 448 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false) +; 
MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) -; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false) +; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false) ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) @@ -56,22 +56,22 @@ define i32 @abs(i32 %arg) { ; ; MVE-SIZE-LABEL: 'abs' ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %I64 = call i64 @llvm.abs.i64(i64 undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 146 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I32 = call <2 x i32> 
@llvm.abs.v2i32(<2 x i32> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I32 = call <2 x i32> @llvm.abs.v2i32(<2 x i32> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I16 = call <2 x i16> @llvm.abs.v2i16(<2 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I16 = call <4 x i16> @llvm.abs.v4i16(<4 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 undef, i1 false) -; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false) +; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2I8 = call <2 x i8> @llvm.abs.v2i8(<2 x i8> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4I8 = call <4 x i8> @llvm.abs.v4i8(<4 x i8> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8I8 = call <8 x i8> @llvm.abs.v8i8(<8 x i8> undef, i1 false) ; MVE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> undef, i1 false) diff --git a/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll b/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll index ca53d85e556a..c2af5e1cacb5 100644 --- a/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-cmp.ll @@ -4,21 +4,21 @@ define void @icmp() { ; CHECK-LABEL: 'icmp' -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = icmp slt <2 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2i8 = icmp slt <2 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = icmp slt <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = icmp slt <8 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = icmp slt <16 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1028 for instruction: %v32i8 = icmp slt <32 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = icmp slt <2 x i16> undef, undef +; CHECK-NEXT: Cost Model: 
Found an estimated cost of 132 for instruction: %v32i8 = icmp slt <32 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2i16 = icmp slt <2 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = icmp slt <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = icmp slt <8 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %v16i16 = icmp slt <16 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = icmp slt <2 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v16i16 = icmp slt <16 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v2i32 = icmp slt <2 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = icmp slt <4 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v8i32 = icmp slt <8 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %v16i32 = icmp slt <16 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = icmp slt <2 x i64> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4i64 = icmp slt <4 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v8i32 = icmp slt <8 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v16i32 = icmp slt <16 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v2i64 = icmp slt <2 x i64> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v4i64 = icmp slt <4 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v8i64 = icmp slt <8 x i64> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -47,31 +47,31 @@ define void @icmp() { define void @fcmp() { ; CHECK-MVE-LABEL: 'fcmp' -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %v16f32 = fcmp olt <16 x float> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = fcmp olt <2 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v4f64 = fcmp olt <4 x double> undef, undef -; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %v8f64 = fcmp olt <8 x double> undef, 
undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16f32 = fcmp olt <16 x float> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = fcmp olt <2 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4f64 = fcmp olt <4 x double> undef, undef +; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = fcmp olt <8 x double> undef, undef ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-MVEFP-LABEL: 'fcmp' ; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = fcmp olt <2 x half> undef, undef ; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = fcmp olt <4 x half> undef, undef ; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = fcmp olt <8 x half> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 260 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v16f16 = fcmp olt <16 x half> undef, undef ; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = fcmp olt <2 x float> undef, undef ; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = fcmp olt <4 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %v16f32 = fcmp olt <16 x float> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = fcmp olt <2 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = fcmp olt <4 x double> undef, undef -; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %v8f64 = fcmp olt <8 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %v8f32 = fcmp olt <8 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %v16f32 = fcmp olt <16 x float> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = fcmp olt <2 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = fcmp olt <4 x double> undef, undef +; CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f64 = fcmp olt <8 x double> undef, undef ; 
CHECK-MVEFP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2f16 = fcmp olt <2 x half> undef, undef diff --git a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll index fe75e5087ec4..c368991faea6 100644 --- a/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll +++ b/llvm/test/Analysis/CostModel/ARM/mve-gather-scatter-cost.ll @@ -3,32 +3,32 @@ define i32 @masked_gather() { ; CHECK-LABEL: 'masked_gather' -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 4, <4 x i1> undef, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 4, <2 x i1> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 4, <16 x i1> undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 4, <8 x i1> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 4, <4 x i1> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> undef, i32 4, <2 x i1> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %V16F32 = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> undef, i32 4, <16 x i1> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V8F32 = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> undef, i32 4, <8 x i1> undef, <8 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> undef, i32 4, <4 x i1> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 4, <2 x i1> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 2, <16 x i1> undef, <16 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 2, <8 x i1> undef, <8 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 2, <4 x i1> undef, <4 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 2, <2 x i1> undef, <2 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 4, <4 x i1> undef, <4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = 
call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 4, <16 x i1> undef, <16 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 4, <8 x i1> undef, <8 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> undef, i32 4, <2 x i1> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V16F16 = call <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*> undef, i32 2, <16 x i1> undef, <16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> undef, i32 2, <8 x i1> undef, <8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> undef, i32 2, <4 x i1> undef, <4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = call <2 x half> @llvm.masked.gather.v2f16.v2p0f16(<2 x half*> undef, i32 2, <2 x i1> undef, <2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V4I64 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> undef, i32 4, <4 x i1> undef, <4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V2I64 = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> undef, i32 4, <2 x i1> undef, <2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: %V16I32 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> undef, i32 4, <16 x i1> undef, <16 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V8I32 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> undef, i32 4, <8 x i1> undef, <8 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> undef, i32 4, <4 x i1> undef, <4 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 4, <2 x i1> undef, <2 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 544 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 2, <16 x i1> undef, <16 x i16> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 2, <8 x i1> undef, <8 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I32 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> undef, i32 4, <2 x i1> undef, <2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %V16I16 = call <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*> undef, i32 2, <16 x i1> undef, <16 x i16> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I16 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> undef, i32 2, <8 x 
i1> undef, <8 x i16> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I16 = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> undef, i32 2, <4 x i1> undef, <4 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 2, <2 x i1> undef, <2 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2112 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I16 = call <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*> undef, i32 2, <2 x i1> undef, <2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V4I8 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> undef, i32 1, <4 x i1> undef, <4 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> undef, i32 1, <2 x i1> undef, <2 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4I32p = call <4 x i32*> @llvm.masked.gather.v4p0i32.v4p0p0i32(<4 x i32**> undef, i32 4, <4 x i1> undef, <4 x i32*> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V2I8 = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> undef, i32 1, <2 x i1> undef, <2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V4I32p = call <4 x i32*> @llvm.masked.gather.v4p0i32.v4p0p0i32(<4 x i32**> undef, i32 4, <4 x i1> undef, <4 x i32*> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
  %V4F64 = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> undef, i32 4, <4 x i1> undef, <4 x double> undef)
@@ -70,31 +70,31 @@ define i32 @masked_gather() {
 
 define i32 @masked_scatter() {
 ; CHECK-LABEL: 'masked_scatter'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 4, <4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 4, <2 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 576 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 4, <16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 4, <8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 4, <4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> undef, <2 x double*> undef, i32 4, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> undef, <16 x float*> undef, i32 4, <16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> undef, <8 x float*> undef, i32 4, <8 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> undef, <4 x float*> undef, i32 4, <4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 4, <2 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 544 for instruction: call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> undef, <16 x half*> undef, i32 2, <16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> undef, <8 x half*> undef, i32 2, <8 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> undef, <4 x half*> undef, i32 2, <4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> undef, <2 x half*> undef, i32 2, <2 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 4, <4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 4, <2 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 576 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 4, <16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 4, <8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> undef, <2 x float*> undef, i32 4, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: call void @llvm.masked.scatter.v16f16.v16p0f16(<16 x half> undef, <16 x half*> undef, i32 2, <16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> undef, <8 x half*> undef, i32 2, <8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> undef, <4 x half*> undef, i32 2, <4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: call void @llvm.masked.scatter.v2f16.v2p0f16(<2 x half> undef, <2 x half*> undef, i32 2, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v4i64.v4p0i64(<4 x i64> undef, <4 x i64*> undef, i32 4, <4 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: call void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> undef, <2 x i64*> undef, i32 4, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 192 for instruction: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> undef, <16 x i32*> undef, i32 4, <16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> undef, <8 x i32*> undef, i32 4, <8 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> undef, <4 x i32*> undef, i32 4, <4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 4, <2 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 544 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 2, <16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 2, <8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> undef, <2 x i32*> undef, i32 4, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: call void @llvm.masked.scatter.v16i16.v16p0i16(<16 x i16> undef, <16 x i16*> undef, i32 2, <16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> undef, <8 x i16*> undef, i32 2, <8 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> undef, <4 x i16*> undef, i32 2, <4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> undef, <2 x i16*> undef, i32 2, <2 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2112 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> undef, <2 x i16*> undef, i32 2, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 320 for instruction: call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> undef, <32 x i8*> undef, i32 1, <32 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> undef, <16 x i8*> undef, i32 1, <16 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> undef, <8 x i8*> undef, i32 1, <8 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> undef, <4 x i8*> undef, i32 1, <4 x i1> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> undef, <2 x i8*> undef, i32 1, <2 x i1> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> undef, <2 x i8*> undef, i32 1, <2 x i1> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
 ;
  call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> undef, <4 x double*> undef, i32 4, <4 x i1> undef)
@@ -205,8 +205,8 @@ define void @gep_v4f32(float* %base, i16* %base16, i8* %base8, <4 x i32> %ind32,
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %res3 = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep3, i32 4, <4 x i1> %mask, <4 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %res3, <4 x float*> %gep3, i32 4, <4 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gepu = getelementptr float, float* %base, <4 x i32> %ind32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resu, <4 x float*> %gepu, i32 1, <4 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %resu = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gepu, i32 1, <4 x i1> %mask, <4 x float> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %resu, <4 x float*> %gepu, i32 1, <4 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <4 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <4 x i8*> %gepos to <4 x float*>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %resos = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %geposb, i32 4, <4 x i1> %mask, <4 x float> undef)
@@ -336,26 +336,26 @@ define void @gep_v4i8(i8* %base, <4 x i8> %ind8, <4 x i1> %mask) {
 define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i8> %ind8, <8 x i1> %mask) {
 ; CHECK-LABEL: 'gep_v8i16'
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i16, i16* %base, <8 x i32> %ind32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %res1 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep1, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res1, <8 x i16*> %gep1, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i16, i16* %base, <8 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 2, <8 x i1> %mask, <8 x i16> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res2, <8 x i16*> %gep2, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i16, i16* %base, <8 x i32> %indsext
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res3, <8 x i16*> %gep3, i32 2, <8 x i1> %mask)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resu, <8 x i16*> %gep2, i32 1, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %res3 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep3, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %res3, <8 x i16*> %gep3, i32 2, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %resu = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gep2, i32 1, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resu, <8 x i16*> %gep2, i32 1, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x i16*>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %resos = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %geposb, i32 2, <8 x i1> %mask, <8 x i16> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resos, <8 x i16*> %geposb, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x i16*>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %resbs = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gepbsb, i32 2, <8 x i1> %mask, <8 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %resbs, <8 x i16*> %gepbsb, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext4 = zext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i16, i16* %base, <8 x i32> %indzext4
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %indtrunc = trunc <8 x i32> %ind32 to <8 x i16>
@@ -417,26 +417,26 @@ define void @gep_v8i16(i16* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <
 define void @gep_v8f16(half* %base, i8* %base8, i32* %base32, <8 x i32> %ind32, <8 x i16> %ind16, <8 x i1> %mask) {
 ; CHECK-LABEL: 'gep_v8f16'
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep1 = getelementptr half, half* %base, <8 x i32> %ind32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res1 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep1, i32 2, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res1, <8 x half*> %gep1, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indzext = zext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep2 = getelementptr half, half* %base, <8 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %res2 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 2, <8 x i1> %mask, <8 x half> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res2, <8 x half*> %gep2, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %indsext = sext <8 x i16> %ind16 to <8 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %gep3 = getelementptr half, half* %base, <8 x i32> %indsext
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res3, <8 x half*> %gep3, i32 2, <8 x i1> %mask)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resu, <8 x half*> %gep2, i32 1, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %res3 = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep3, i32 2, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %res3, <8 x half*> %gep3, i32 2, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %resu = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gep2, i32 1, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resu, <8 x half*> %gep2, i32 1, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepos = getelementptr i8, i8* %base8, <8 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %geposb = bitcast <8 x i8*> %gepos to <8 x half*>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %resos = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %geposb, i32 2, <8 x i1> %mask, <8 x half> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resos, <8 x half*> %geposb, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i32, i32* %base32, <8 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <8 x i32*> %gepbs to <8 x half*>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 136 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %resbs = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gepbsb, i32 2, <8 x i1> %mask, <8 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %resbs, <8 x half*> %gepbsb, i32 2, <8 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; no offset ext
@@ -509,20 +509,20 @@ define void @gep_v8i8(i8* %base, <8 x i8> %ind8, <8 x i1> %mask) {
 define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind32, <16 x i1> %mask) {
 ; CHECK-LABEL: 'gep_v16i8'
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep1 = getelementptr i8, i8* %base, <16 x i32> %ind32
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %res1 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep1, i32 1, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res1, <16 x i8*> %gep1, i32 2, <16 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext = zext <16 x i8> %ind8 to <16 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep2 = getelementptr i8, i8* %base, <16 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %res2 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep2, i32 2, <16 x i1> %mask, <16 x i8> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res2, <16 x i8*> %gep2, i32 2, <16 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indsext = sext <16 x i8> %ind8 to <16 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep3 = getelementptr i8, i8* %base, <16 x i32> %indsext
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res3, <16 x i8*> %gep3, i32 2, <16 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %res3 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gep3, i32 2, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %res3, <16 x i8*> %gep3, i32 2, <16 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i16, i16* %base16, <16 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbsb = bitcast <16 x i16*> %gepbs to <16 x i8*>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbsb, i32 2, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbsb, i32 2, <16 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %indzext4 = zext <16 x i8> %ind8 to <16 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gep4 = getelementptr i8, i8* %base, <16 x i32> %indzext
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %indtrunc = trunc <16 x i32> %ind32 to <16 x i8>
@@ -564,8 +564,8 @@ define void @gep_v16i8(i8* %base, i16* %base16, <16 x i8> %ind8, <16 x i32> %ind
 define void @gep_v16i8p(<16 x i8*> %base, i32 %off, <16 x i1> %mask) {
 ; CHECK-LABEL: 'gep_v16i8p'
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %gepbs = getelementptr i8, <16 x i8*> %base, i32 %off
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbs, i32 2, <16 x i1> %mask, <16 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 528 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbs, i32 2, <16 x i1> %mask)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %resbs = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gepbs, i32 2, <16 x i1> %mask, <16 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 144 for instruction: call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %resbs, <16 x i8*> %gepbs, i32 2, <16 x i1> %mask)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
  %gepbs = getelementptr i8, <16 x i8*> %base, i32 %off
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
index ede3bdda30ca..b075ba754904 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-minmax.ll
@@ -33,22 +33,22 @@ declare <64 x i8> @llvm.smin.v64i8(<64 x i8>, <64 x i8>)
 define i32 @smin(i32 %arg) {
 ; MVE-RECIP-LABEL: 'smin'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.smin.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2I64 = call <2 x i64> @llvm.smin.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V4I64 = call <4 x i64> @llvm.smin.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V8I64 = call <8 x i64> @llvm.smin.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smin.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.smin.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.smin.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.smin.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smin.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.smin.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smin.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smin.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.smin.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.smin.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smin.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.smin.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smin.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smin.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smin.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -140,22 +140,22 @@ declare <64 x i8> @llvm.smax.v64i8(<64 x i8>, <64 x i8>)
 define i32 @smax(i32 %arg) {
 ; MVE-RECIP-LABEL: 'smax'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.smax.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2I64 = call <2 x i64> @llvm.smax.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V4I64 = call <4 x i64> @llvm.smax.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V8I64 = call <8 x i64> @llvm.smax.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.smax.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.smax.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.smax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.smax.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.smax.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.smax.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.smax.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.smax.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.smax.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.smax.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.smax.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.smax.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.smax.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.smax.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.smax.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.smax.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -248,22 +248,22 @@ declare <64 x i8> @llvm.umin.v64i8(<64 x i8>, <64 x i8>)
 define i32 @umin(i32 %arg) {
 ; MVE-RECIP-LABEL: 'umin'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.umin.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2I64 = call <2 x i64> @llvm.umin.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V4I64 = call <4 x i64> @llvm.umin.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V8I64 = call <8 x i64> @llvm.umin.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umin.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.umin.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umin.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.umin.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.umin.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umin.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.umin.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umin.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umin.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umin.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umin.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umin.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.umin.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umin.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umin.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umin.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -355,22 +355,22 @@ declare <64 x i8> @llvm.umax.v64i8(<64 x i8>, <64 x i8>)
 define i32 @sub(i32 %arg) {
 ; MVE-RECIP-LABEL: 'sub'
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.umax.i64(i64 undef, i64 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2I64 = call <2 x i64> @llvm.umax.v2i64(<2 x i64> undef, <2 x i64> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 152 for instruction: %V4I64 = call <4 x i64> @llvm.umax.v4i64(<4 x i64> undef, <4 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 304 for instruction: %V8I64 = call <8 x i64> @llvm.umax.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.umax.i32(i32 undef, i32 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I32 = call <2 x i32> @llvm.umax.v2i32(<2 x i32> undef, <2 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.umax.v8i32(<8 x i32> undef, <8 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = call <16 x i32> @llvm.umax.v16i32(<16 x i32> undef, <16 x i32> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I16 = call i16 @llvm.umax.i16(i16 undef, i16 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I16 = call <2 x i16> @llvm.umax.v2i16(<2 x i16> undef, <2 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I16 = call <4 x i16> @llvm.umax.v4i16(<4 x i16> undef, <4 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = call <8 x i16> @llvm.umax.v8i16(<8 x i16> undef, <8 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = call <16 x i16> @llvm.umax.v16i16(<16 x i16> undef, <16 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = call <32 x i16> @llvm.umax.v32i16(<32 x i16> undef, <32 x i16> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %I8 = call i8 @llvm.umax.i8(i8 undef, i8 undef)
-; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
+; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V2I8 = call <2 x i8> @llvm.umax.v2i8(<2 x i8> undef, <2 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4I8 = call <4 x i8> @llvm.umax.v4i8(<4 x i8> undef, <4 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8I8 = call <8 x i8> @llvm.umax.v8i8(<8 x i8> undef, <8 x i8> undef)
 ; MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = call <16 x i8> @llvm.umax.v16i8(<16 x i8> undef, <16 x i8> undef)
@@ -455,45 +455,45 @@ declare <32 x half> @llvm.minnum.v32f16(<32 x half>, <32 x half>)
 define float @minnum(float %arg) {
 ; MVEI-RECIP-LABEL: 'minnum'
 ; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.minnum.f64(double undef, double undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
 ; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
 ; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F16 = call half @llvm.minnum.f16(half undef, half undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 1344 for instruction: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
 ; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float undef
 ;
 ; MVEI-SIZE-LABEL: 'minnum'
 ; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.minnum.f64(double undef, double undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
 ; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.minnum.v8f32(<8 x float> undef, <8 x float> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.minnum.v16f32(<16 x float> undef, <16 x float> undef)
 ; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.minnum.f16(half undef, half undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1056 for instruction: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <2 x half> @llvm.minnum.v2f16(<2 x half> undef, <2 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <4 x half> @llvm.minnum.v4f16(<4 x half> undef, <4 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <8 x half> @llvm.minnum.v8f16(<8 x half> undef, <8 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <16 x half> @llvm.minnum.v16f16(<16 x half> undef, <16 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <32 x half> @llvm.minnum.v32f16(<32 x half> undef, <32 x half> undef)
 ; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
 ;
 ; MVEF-RECIP-LABEL: 'minnum'
 ; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.minnum.f64(double undef, double undef)
-; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
 ; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
 ; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
 ; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
@@ -509,9 +509,9 @@ define float @minnum(float %arg) {
 ;
 ; MVEF-SIZE-LABEL: 'minnum'
 ; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.minnum.f64(double undef, double undef)
-; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.minnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.minnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.minnum.v8f64(<8 x double> undef, <8 x double> undef)
 ; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.minnum.f32(float undef, float undef)
 ; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.minnum.v2f32(<2 x float> undef, <2 x float> undef)
 ; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.minnum.v4f32(<4 x float> undef, <4 x float> undef)
@@ -567,45 +567,45 @@ declare <32 x half> @llvm.maxnum.v32f16(<32 x half>, <32 x half>)
 define float @maxnum(float %arg) {
 ; MVEI-RECIP-LABEL: 'maxnum'
 ; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
 ; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
 ; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F16 = call half @llvm.maxnum.f16(half undef, half undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 1344 for instruction: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 176 for instruction: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 352 for instruction: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
 ; MVEI-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float undef
 ;
 ; MVEI-SIZE-LABEL: 'maxnum'
 ; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
 ; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F32 = call <8 x float> @llvm.maxnum.v8f32(<8 x float> undef, <8 x float> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F32 = call <16 x float> @llvm.maxnum.v16f32(<16 x float> undef, <16 x float> undef)
 ; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F16 = call half @llvm.maxnum.f16(half undef, half undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 272 for instruction: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
-; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1056 for instruction: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F16 = call <2 x half> @llvm.maxnum.v2f16(<2 x half> undef, <2 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F16 = call <4 x half> @llvm.maxnum.v4f16(<4 x half> undef, <4 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F16 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> undef, <8 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V16F16 = call <16 x half> @llvm.maxnum.v16f16(<16 x half> undef, <16 x half> undef)
+; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V32F16 = call <32 x half> @llvm.maxnum.v32f16(<32 x half> undef, <32 x half> undef)
 ; MVEI-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret float undef
 ;
 ; MVEF-RECIP-LABEL: 'maxnum'
 ; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
-; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
 ; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
 ; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
 ; MVEF-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
@@ -621,9 +621,9 @@ define float @maxnum(float %arg) {
 ;
 ; MVEF-SIZE-LABEL: 'maxnum'
 ; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.maxnum.f64(double undef, double undef)
-; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
-; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
-; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
+; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = call <2 x double> @llvm.maxnum.v2f64(<2 x double> undef, <2 x double> undef)
+; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> undef, <4 x double> undef)
+; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8F64 = call <8 x double> @llvm.maxnum.v8f64(<8 x double> undef, <8 x double> undef)
 ; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.maxnum.f32(float undef, float undef)
 ; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2F32 = call <2 x float> @llvm.maxnum.v2f32(<2 x float> undef, <2 x float> undef)
 ; MVEF-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4F32 = call <4 x float> @llvm.maxnum.v4f32(<4 x float> undef, <4 x float> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll b/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
index 8b59de716fcc..c670d6f6baee 100644
--- a/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
+++ b/llvm/test/Analysis/CostModel/ARM/mve-vecreduce-add.ll
@@ -3,8 +3,8 @@
 
 define void @add_i8() {
 ; CHECK-LABEL: 'add_i8'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a3 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a4 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
@@ -26,13 +26,13 @@ define void @add_i8() {
 define void @add_i16() {
 ; CHECK-LABEL: 'add_i16'
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0za)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sa)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1za)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sa)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a2za)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2sa = sext <4 x i8> undef to <4 x i16>
@@ -45,8 +45,8 @@ define void @add_i16() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a4z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4za)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %a4sa = sext <16 x i8> undef to <16 x i16>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sa)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a8 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a9 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
@@ -98,13 +98,13 @@ define void @add_i16() {
 define void @add_i32() {
 ; CHECK-LABEL: 'add_i32'
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0za)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32>
-; CHECK-NEXT: 
Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a2za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2sa = sext <4 x i8> undef to <4 x i32> @@ -118,13 +118,13 @@ define void @add_i32() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %a4sa = sext <16 x i8> undef to <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a4s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a4sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a7z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a7za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %a7sa = sext <4 x i16> undef to <4 x i32> @@ -137,8 +137,8 @@ define void @add_i32() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a9z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %a9sa = sext <16 x i16> undef to <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sa) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a13 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a14 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) @@ -219,71 +219,71 @@ define void @add_i32() { define void @add_i64() { ; CHECK-LABEL: 'add_i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a0za = zext <1 x i8> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a0za = zext <1 x i8> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a2z = call i64 
@llvm.vector.reduce.add.v4i64(<4 x i64> %a2za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a5za = zext <1 x i16> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a5za = zext <1 x i16> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6za) ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a10za = zext <1 x i32> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a10za = zext <1 x i32> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10za) +; CHECK-NEXT: Cost Model: Found an 
estimated cost of 34 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sa) ; CHECK-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za) +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14za) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 66 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sa) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %a0za = zext <1 x i8> undef to <1 x i64> @@ -392,9 +392,9 @@ define void @add_i64() { define void @mla_i8() { ; CHECK-LABEL: 'mla_i8' ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0m = mul <1 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a1m = mul <2 x i8> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0 = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %a0m) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %a1m = mul <2 x i8> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %a1m) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2m = mul <4 x i8> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %a2m) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a3m = mul <8 x i8> undef, undef @@ -426,19 +426,19 @@ define void @mla_i16() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i16> %a0za, %a0zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i16> ; CHECK-NEXT: Cost Model: Found an estimated 
cost of 2 for instruction: %a0sm = mul <1 x i16> %a0sa, %a0sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a0sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i16> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %a1zm = mul <2 x i16> %a1za, %a1zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %a1zm = mul <2 x i16> %a1za, %a1zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i16> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %a1sm = mul <2 x i16> %a1sa, %a1sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %a1sm = mul <2 x i16> %a1sa, %a1sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a1sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2za = zext <4 x i8> undef to <4 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2zb = zext <4 x i8> undef to <4 x i16> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i16> %a2za, %a2zb @@ -464,9 +464,9 @@ define void @mla_i16() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a4sm = mul <16 x i16> %a4sa, %a4sb ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a4s = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %a4sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5m = mul <1 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a6m = mul <2 x i16> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a5 = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %a5m) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %a6m = mul <2 x i16> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a6 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %a6m) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a7m = mul <4 x i16> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a7 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a7m) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for 
instruction: %a8m = mul <8 x i16> undef, undef @@ -548,19 +548,19 @@ define void @mla_i32() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0za = zext <1 x i8> undef to <1 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0zb = zext <1 x i8> undef to <1 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0zm = mul <1 x i32> %a0za, %a0zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0sa = sext <1 x i8> undef to <1 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0sb = sext <1 x i8> undef to <1 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a0sm = mul <1 x i32> %a0sa, %a0sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a0sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a1za = zext <2 x i8> undef to <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a1zb = zext <2 x i8> undef to <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %a1zm = mul <2 x i32> %a1za, %a1zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %a1zm = mul <2 x i32> %a1za, %a1zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1sa = sext <2 x i8> undef to <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1sb = sext <2 x i8> undef to <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %a1sm = mul <2 x i32> %a1sa, %a1sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %a1sm = mul <2 x i32> %a1sa, %a1sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a1s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a1sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2za = zext <4 x i8> undef to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a2zb = zext <4 x i8> undef to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a2zm = mul <4 x i32> %a2za, %a2zb @@ -588,19 +588,19 @@ define void @mla_i32() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5za = zext <1 x i16> undef to <1 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5zb = zext <1 x i16> undef to <1 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5zm = mul <1 x i32> %a5za, %a5zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i32 
@llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a5z = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5sa = sext <1 x i16> undef to <1 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5sb = sext <1 x i16> undef to <1 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a5sm = mul <1 x i32> %a5sa, %a5sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a5s = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a5sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a6za = zext <2 x i16> undef to <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a6zb = zext <2 x i16> undef to <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %a6zm = mul <2 x i32> %a6za, %a6zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %a6zm = mul <2 x i32> %a6za, %a6zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a6z = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a6sa = sext <2 x i16> undef to <2 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a6sb = sext <2 x i16> undef to <2 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %a6sm = mul <2 x i32> %a6sa, %a6sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %a6sm = mul <2 x i32> %a6sa, %a6sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a6s = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a6sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a7za = zext <4 x i16> undef to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a7zb = zext <4 x i16> undef to <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a7zm = mul <4 x i32> %a7za, %a7zb @@ -626,9 +626,9 @@ define void @mla_i32() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a9sm = mul <16 x i32> %a9sa, %a9sb ; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a9s = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a9sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a10m = mul <1 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a11m = mul <2 x i32> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a10 = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %a10m) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: 
%a11m = mul <2 x i32> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %a11 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a11m) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a12m = mul <4 x i32> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %a12 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a12m) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a13m = mul <8 x i32> undef, undef @@ -757,136 +757,136 @@ define void @mla_i32() { define void @mla_i64() { ; CHECK-LABEL: 'mla_i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a0za = zext <1 x i8> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a0zb = zext <1 x i8> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a0za = zext <1 x i8> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a0zb = zext <1 x i8> undef to <1 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0zm = mul <1 x i64> %a0za, %a0zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a0sb = sext <1 x i8> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a0z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a0sa = sext <1 x i8> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a0sb = sext <1 x i8> undef to <1 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a0sm = mul <1 x i64> %a0sa, %a0sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a0s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a0sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1za = zext <2 x i8> undef to <2 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a1zb = zext <2 x i8> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a1zm = mul <2 x i64> %a1za, %a1zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %a1zm = mul <2 x i64> %a1za, %a1zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a1z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %a1sa = sext <2 x i8> undef to <2 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %a1sb = sext <2 x i8> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a1sm = mul <2 x i64> %a1sa, %a1sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for 
instruction: %a1sm = mul <2 x i64> %a1sa, %a1sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a1s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a1sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a2za = zext <4 x i8> undef to <4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a2zb = zext <4 x i8> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %a2zm = mul <4 x i64> %a2za, %a2zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %a2zm = mul <4 x i64> %a2za, %a2zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a2z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %a2sa = sext <4 x i8> undef to <4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %a2sb = sext <4 x i8> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %a2sm = mul <4 x i64> %a2sa, %a2sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %a2sm = mul <4 x i64> %a2sa, %a2sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a2s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a2sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %a3za = zext <8 x i8> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %a3zb = zext <8 x i8> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a3zm = mul <8 x i64> %a3za, %a3zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a3z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %a3sa = sext <8 x i8> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %a3sb = sext <8 x i8> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a3sm = mul <8 x i64> %a3sa, %a3sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a3s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a3sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %a4za = zext <16 x i8> undef to <16 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 298 for instruction: %a4zb = zext <16 x i8> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 800 for instruction: %a4zm = mul <16 x i64> %a4za, %a4zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a4z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %a4zm = mul <16 x i64> %a4za, %a4zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a4z = call i64 
@llvm.vector.reduce.add.v16i64(<16 x i64> %a4zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %a4sa = sext <16 x i8> undef to <16 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1322 for instruction: %a4sb = sext <16 x i8> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 800 for instruction: %a4sm = mul <16 x i64> %a4sa, %a4sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a5za = zext <1 x i16> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a5zb = zext <1 x i16> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %a4sm = mul <16 x i64> %a4sa, %a4sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a4s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a4sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a5za = zext <1 x i16> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a5zb = zext <1 x i16> undef to <1 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a5zm = mul <1 x i64> %a5za, %a5zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a5sb = sext <1 x i16> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a5z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %a5sa = sext <1 x i16> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %a5sb = sext <1 x i16> undef to <1 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a5sm = mul <1 x i64> %a5sa, %a5sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a5s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a5sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a6za = zext <2 x i16> undef to <2 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a6zb = zext <2 x i16> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a6zm = mul <2 x i64> %a6za, %a6zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %a6zm = mul <2 x i64> %a6za, %a6zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a6z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %a6sa = sext <2 x i16> undef to <2 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %a6sb = sext <2 x i16> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a6sm = mul <2 x i64> %a6sa, %a6sb -; CHECK-NEXT: Cost 
Model: Found an estimated cost of 30 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %a6sm = mul <2 x i64> %a6sa, %a6sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a6s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a6sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a7za = zext <4 x i16> undef to <4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a7zb = zext <4 x i16> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %a7zm = mul <4 x i64> %a7za, %a7zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %a7zm = mul <4 x i64> %a7za, %a7zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a7z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %a7sa = sext <4 x i16> undef to <4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 82 for instruction: %a7sb = sext <4 x i16> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %a7sm = mul <4 x i64> %a7sa, %a7sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %a7sm = mul <4 x i64> %a7sa, %a7sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a7s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a7sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %a8za = zext <8 x i16> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 74 for instruction: %a8zb = zext <8 x i16> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a8zm = mul <8 x i64> %a8za, %a8zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a8z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %a8sa = sext <8 x i16> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 330 for instruction: %a8sb = sext <8 x i16> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a8sm = mul <8 x i64> %a8sa, %a8sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a8s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a8sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 296 for instruction: %a9za = zext <16 x i16> undef to <16 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 296 for instruction: %a9zb = zext <16 x i16> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 800 for instruction: %a9zm = mul <16 x i64> %a9za, %a9zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm) +; CHECK-NEXT: Cost 
Model: Found an estimated cost of 416 for instruction: %a9zm = mul <16 x i64> %a9za, %a9zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a9z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %a9sa = sext <16 x i16> undef to <16 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1320 for instruction: %a9sb = sext <16 x i16> undef to <16 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 800 for instruction: %a9sm = mul <16 x i64> %a9sa, %a9sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a10za = zext <1 x i32> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a10zb = zext <1 x i32> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %a9sm = mul <16 x i64> %a9sa, %a9sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a9s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a9sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a10za = zext <1 x i32> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a10zb = zext <1 x i32> undef to <1 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a10zm = mul <1 x i64> %a10za, %a10zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm) -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %a10sb = sext <1 x i32> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a10z = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a10sa = sext <1 x i32> undef to <1 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %a10sb = sext <1 x i32> undef to <1 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a10sm = mul <1 x i64> %a10sa, %a10sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a10s = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a10sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a11za = zext <2 x i32> undef to <2 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a11zb = zext <2 x i32> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a11zm = mul <2 x i64> %a11za, %a11zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %a11zm = mul <2 x i64> %a11za, %a11zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a11z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a11sa = sext <2 x i32> undef to <2 x i64> ; CHECK-NEXT: Cost Model: 
Found an estimated cost of 16 for instruction: %a11sb = sext <2 x i32> undef to <2 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %a11sm = mul <2 x i64> %a11sa, %a11sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %a11sm = mul <2 x i64> %a11sa, %a11sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a11s = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a11sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a12za = zext <4 x i32> undef to <4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %a12zb = zext <4 x i32> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %a12zm = mul <4 x i64> %a12za, %a12zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %a12zm = mul <4 x i64> %a12za, %a12zb +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a12z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a12sa = sext <4 x i32> undef to <4 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a12sb = sext <4 x i32> undef to <4 x i64> -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %a12sm = mul <4 x i64> %a12sa, %a12sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 104 for instruction: %a12sm = mul <4 x i64> %a12sa, %a12sb +; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a12s = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a12sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %a13za = zext <8 x i32> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %a13zb = zext <8 x i32> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a13zm = mul <8 x i64> %a13za, %a13zb -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a13z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13zm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %a13sa = sext <8 x i32> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 264 for instruction: %a13sb = sext <8 x i32> undef to <8 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a13sm = mul <8 x i64> %a13sa, %a13sb -; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm) +; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a13s = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a13sm) ; CHECK-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %a14za = zext <16 x i32> undef to <16 x i64> ; CHECK-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %a14zb = zext <16 x i32> undef to <16 x i64> 
-; CHECK-NEXT: Cost Model: Found an estimated cost of 800 for instruction: %a14zm = mul <16 x i64> %a14za, %a14zb
-; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %a14zm = mul <16 x i64> %a14za, %a14zb
+; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a14z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14zm)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1056 for instruction: %a14sa = sext <16 x i32> undef to <16 x i64>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1056 for instruction: %a14sb = sext <16 x i32> undef to <16 x i64>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 800 for instruction: %a14sm = mul <16 x i64> %a14sa, %a14sb
-; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 416 for instruction: %a14sm = mul <16 x i64> %a14sa, %a14sb
+; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a14s = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a14sm)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %a15m = mul <1 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a16m = mul <2 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %a17m = mul <4 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %a15 = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a15m)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %a16m = mul <2 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 108 for instruction: %a16 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a16m)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %a17m = mul <4 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %a17 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %a17m)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %a18m = mul <8 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 202 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %a19m = mul <16 x i64> undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 730 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 408 for instruction: %a18 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %a18m)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %a19m = mul <16 x i64> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 808 for instruction: %a19 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %a19m)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
   %a0za = zext <1 x i8> undef to <1 x i64>
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
index 868ebdc24756..5a72f3be05cc 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-smax.ll
@@ -21,11 +21,11 @@ define i32 @reduce_i64(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V2 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 568 for instruction: %V8 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1128 for instruction: %V16 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1 = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> undef)
@@ -54,11 +54,11 @@ define i32 @reduce_i32(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 696 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 2504 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V8 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V16 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 712 for instruction: %V32 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> undef)
@@ -89,12 +89,12 @@ define i32 @reduce_i16(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 2976 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 10160 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 532 for instruction: %V16 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 860 for instruction: %V32 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1516 for instruction: %V64 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> undef)
@@ -128,13 +128,13 @@ define i32 @reduce_i8(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 12844 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 41532 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V16 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1304 for instruction: %V32 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1952 for instruction: %V64 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 3248 for instruction: %V128 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
index 61ec7a4637d7..cd4da5435436 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-smin.ll
@@ -21,11 +21,11 @@ define i32 @reduce_i64(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V2 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 568 for instruction: %V8 = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1128 for instruction: %V16 = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1 = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> undef)
@@ -54,11 +54,11 @@ define i32 @reduce_i32(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 696 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 2504 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V8 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V16 = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 712 for instruction: %V32 = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> undef)
@@ -89,12 +89,12 @@ define i32 @reduce_i16(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 2976 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 10160 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 532 for instruction: %V16 = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 860 for instruction: %V32 = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1516 for instruction: %V64 = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> undef)
@@ -128,13 +128,13 @@ define i32 @reduce_i8(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 12844 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 41532 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V16 = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1304 for instruction: %V32 = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1952 for instruction: %V64 = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 3248 for instruction: %V128 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll
index 96f24f3c92e9..0447e3826ca5 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-umax.ll
@@ -21,11 +21,11 @@ define i32 @reduce_i64(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V2 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 568 for instruction: %V8 = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1128 for instruction: %V16 = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1 = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> undef)
@@ -54,11 +54,11 @@ define i32 @reduce_i32(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 696 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 2504 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V8 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V16 = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 712 for instruction: %V32 = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> undef)
@@ -89,12 +89,12 @@ define i32 @reduce_i16(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 2976 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 10160 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 532 for instruction: %V16 = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 860 for instruction: %V32 = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1516 for instruction: %V64 = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> undef)
@@ -128,13 +128,13 @@ define i32 @reduce_i8(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 12844 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 41532 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V16 = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1304 for instruction: %V32 = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1952 for instruction: %V64 = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 3248 for instruction: %V128 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll
index d77ed48b4a5b..62510ae70f2b 100644
--- a/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll
+++ b/llvm/test/Analysis/CostModel/ARM/reduce-umin.ll
@@ -21,11 +21,11 @@ define i32 @reduce_i64(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i64'
-; MVE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 98 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 282 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 970 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 148 for instruction: %V2 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 288 for instruction: %V4 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 568 for instruction: %V8 = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1128 for instruction: %V16 = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V1 = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> undef)
@@ -54,11 +54,11 @@ define i32 @reduce_i32(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i32'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 696 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 2504 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 208 for instruction: %V8 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 376 for instruction: %V16 = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 712 for instruction: %V32 = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> undef)
@@ -89,12 +89,12 @@ define i32 @reduce_i16(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i16'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 1176 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 2976 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 10160 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 532 for instruction: %V16 = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 860 for instruction: %V32 = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1516 for instruction: %V64 = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> undef)
@@ -128,13 +128,13 @@ define i32 @reduce_i8(i32 %arg) {
 ; NEON-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
 ; MVE-LABEL: 'reduce_i8'
-; MVE-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 76 for instruction: %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 140 for instruction: %V4 = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 788 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 4128 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 5668 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 12844 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
-; MVE-NEXT: Cost Model: Found an estimated cost of 41532 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 400 for instruction: %V8 = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1044 for instruction: %V16 = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1304 for instruction: %V32 = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 1952 for instruction: %V64 = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> undef)
+; MVE-NEXT: Cost Model: Found an estimated cost of 3248 for instruction: %V128 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef)
 ; MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %V2 = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> undef)
diff --git a/llvm/test/Analysis/CostModel/ARM/select.ll b/llvm/test/Analysis/CostModel/ARM/select.ll
index 67a558003923..173904d9e60b 100644
--- a/llvm/test/Analysis/CostModel/ARM/select.ll
+++ b/llvm/test/Analysis/CostModel/ARM/select.ll
@@ -18,28 +18,28 @@ define void @selects() {
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4 = select i1 undef, i64 undef, i64 undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v5 = select i1 undef, float undef, float undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v6 = select i1 undef, double undef, double undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v7 = select <2 x i1> undef, <2 x i8> undef, <2 x i8> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8 = select <4 x i1> undef, <4 x i8> undef, <4 x i8> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v9 = select <8 x i1> undef, <8 x i8> undef, <8 x i8> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v10 = select <16 x i1> undef, <16 x i8> undef, <16 x i8> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v11 = select <2 x i1> undef, <2 x i16> undef, <2 x i16> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v12 = select <4 x i1> undef, <4 x i16> undef, <4 x i16> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v13 = select <8 x i1> undef, <8 x i16> undef, <8 x i16> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v14 = select <2 x i1> undef, <2 x i32> undef, <2 x i32> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v15 = select <4 x i1> undef, <4 x i32> undef, <4 x i32> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v16 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 576 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v17 = select <2 x i1> undef, <2 x float> undef, <2 x float> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v18 = select <4 x i1> undef, <4 x float> undef, <4 x float> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v19 = select <2 x i1> undef, <2 x double> undef, <2 x double> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v20 = select <1 x i1> undef, <1 x i32> undef, <1 x i32> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v21 = select <3 x i1> undef, <3 x float> undef, <3 x float> undef
-; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
+; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v22 = select <5 x i1> undef, <5 x double> undef, <5 x double> undef
 ; CHECK-MVE-RECIP-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-NEON-RECIP-LABEL: 'selects'
diff --git a/llvm/test/Analysis/CostModel/ARM/shuffle.ll b/llvm/test/Analysis/CostModel/ARM/shuffle.ll
index dde0a02bf473..65f7ea0393a1 100644
--- a/llvm/test/Analysis/CostModel/ARM/shuffle.ll
+++ b/llvm/test/Analysis/CostModel/ARM/shuffle.ll
@@ -4,19 +4,19 @@ define void @broadcast() {
 ; CHECK-MVE-LABEL: 'broadcast'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
@@ -24,8 +24,8 @@ define void @broadcast() {
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-NEON-LABEL: 'broadcast'
@@ -88,28 +88,28 @@ define void @broadcast() {
 ;; Reverse shuffles should be lowered to vrev and possibly a vext (for quadwords, on neon)
 define void @reverse() {
 ; CHECK-MVE-LABEL: 'reverse'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-NEON-LABEL: 'reverse'
@@ -233,28 +233,28 @@ define void @concat() {
 define void @select() {
 ; CHECK-MVE-LABEL: 'select'
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i16 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v2i32 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v2i64 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f16 = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-NEON-LABEL: 'select'
@@ -321,16 +321,16 @@ define void @vrev2() {
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i16 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4i32 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v4i64 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f16 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-NEON-LABEL: 'vrev2'
@@ -381,11 +381,11 @@ define void @vrev4() {
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v16i8 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8i16 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v16i16 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %v8i32 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 1024 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32>
-; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 256 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32>
+; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32>
 ; CHECK-MVE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; CHECK-NEON-LABEL: 'vrev4'
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
index d2961326ba71..6d7555ae59be 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-icmpcost.ll
@@ -20,18 +20,18 @@ target triple = "thumbv8.1m.main-arm-none-eabi"
 ; CHECK: LV: Scalar loop costs: 5.
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx = getelementptr inbounds i16, i16* %s, i32 %i.016
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: %1 = load i16, i16* %arrayidx, align 2
+; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction: %1 = load i16, i16* %arrayidx, align 2
 ; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction: %conv = sext i16 %1 to i32
-; CHECK: LV: Found an estimated cost of 12 for VF 2 For instruction: %cmp2 = icmp sgt i32 %conv, %conv1
+; CHECK: LV: Found an estimated cost of 20 for VF 2 For instruction: %cmp2 = icmp sgt i32 %conv, %conv1
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %cmp2, label %if.then, label %for.inc
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %conv6 = add i16 %1, %0
+; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction: %conv6 = add i16 %1, %0
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %arrayidx7 = getelementptr inbounds i16, i16* %d, i32 %i.016
 ; CHECK: LV: Found an estimated cost of 16 for VF 2 For instruction: store i16 %conv6, i16* %arrayidx7, align 2
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br label %for.inc
 ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %inc = add nuw nsw i32 %i.016, 1
 ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %exitcond.not = icmp eq i32 %inc, %n
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
-; CHECK: LV: Vector loop of width 2 costs: 29.
+; CHECK: LV: Vector loop of width 2 costs: 43.
; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %i.016 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ] ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %arrayidx = getelementptr inbounds i16, i16* %s, i32 %i.016 ; CHECK: LV: Found an estimated cost of 2 for VF 4 For instruction: %1 = load i16, i16* %arrayidx, align 2 @@ -50,7 +50,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %arrayidx = getelementptr inbounds i16, i16* %s, i32 %i.016 ; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %1 = load i16, i16* %arrayidx, align 2 ; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %conv = sext i16 %1 to i32 -; CHECK: LV: Found an estimated cost of 68 for VF 8 For instruction: %cmp2 = icmp sgt i32 %conv, %conv1 +; CHECK: LV: Found an estimated cost of 36 for VF 8 For instruction: %cmp2 = icmp sgt i32 %conv, %conv1 ; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %cmp2, label %if.then, label %for.inc ; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction: %conv6 = add i16 %1, %0 ; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: %arrayidx7 = getelementptr inbounds i16, i16* %d, i32 %i.016 @@ -59,7 +59,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %inc = add nuw nsw i32 %i.016, 1 ; CHECK: LV: Found an estimated cost of 1 for VF 8 For instruction: %exitcond.not = icmp eq i32 %inc, %n ; CHECK: LV: Found an estimated cost of 0 for VF 8 For instruction: br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body -; CHECK: LV: Vector loop of width 8 costs: 9. +; CHECK: LV: Vector loop of width 8 costs: 5. ; CHECK: LV: Selecting VF: 4. 
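; Note: the per-width totals above are not independent data; the vectorizer
; sums the per-instruction estimates for one vector iteration and divides by
; the width, then picks the width with the lowest per-lane cost. A rough
; sanity check, assuming only the VF 2 numbers printed above:
; 18 + 4 + 20 + 26 + 16 + 1 + 1 = 86, and 86 / 2 = 43, which matches
; "Vector loop of width 2 costs: 43."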
 define void @expensive_icmp(i16* noalias nocapture %d, i16* nocapture readonly %s, i32 %n, i16 zeroext %m) #0 {
 entry:
@@ -120,22 +120,22 @@ for.inc: ; preds = %for.body, %if.then
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %pSrcB.addr.09 = phi i8* [ %incdec.ptr2, %while.body ], [ %pSrcB, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %incdec.ptr = getelementptr inbounds i8, i8* %pSrcA.addr.011, i32 1
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: %0 = load i8, i8* %pSrcA.addr.011, align 1
+; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction: %0 = load i8, i8* %pSrcA.addr.011, align 1
 ; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction: %conv1 = sext i8 %0 to i32
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %incdec.ptr2 = getelementptr inbounds i8, i8* %pSrcB.addr.09, i32 1
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: %1 = load i8, i8* %pSrcB.addr.09, align 1
+; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction: %1 = load i8, i8* %pSrcB.addr.09, align 1
 ; CHECK: LV: Found an estimated cost of 4 for VF 2 For instruction: %conv3 = sext i8 %1 to i32
-; CHECK: LV: Found an estimated cost of 14 for VF 2 For instruction: %mul = mul nsw i32 %conv3, %conv1
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: %shr = ashr i32 %mul, 7
+; CHECK: LV: Found an estimated cost of 26 for VF 2 For instruction: %mul = mul nsw i32 %conv3, %conv1
+; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction: %shr = ashr i32 %mul, 7
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %2 = icmp slt i32 %shr, 127
-; CHECK: LV: Found an estimated cost of 24 for VF 2 For instruction: %spec.select.i = select i1 %2, i32 %shr, i32 127
+; CHECK: LV: Found an estimated cost of 40 for VF 2 For instruction: %spec.select.i = select i1 %2, i32 %shr, i32 127
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %conv4 = trunc i32 %spec.select.i to i8
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: %incdec.ptr5 = getelementptr inbounds i8, i8* %pDst.addr.010, i32 1
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: store i8 %conv4, i8* %pDst.addr.010, align 1
+; CHECK: LV: Found an estimated cost of 18 for VF 2 For instruction: store i8 %conv4, i8* %pDst.addr.010, align 1
 ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %dec = add i32 %blkCnt.012, -1
 ; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cmp.not = icmp eq i32 %dec, 0
 ; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction: br i1 %cmp.not, label %while.end.loopexit, label %while.body
-; CHECK: LV: Vector loop of width 2 costs: 44.
+; CHECK: LV: Vector loop of width 2 costs: 74.
 ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blockSize, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %pSrcA.addr.011 = phi i8* [ %incdec.ptr, %while.body ], [ %pSrcA, %while.body.preheader ]
 ; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction: %pDst.addr.010 = phi i8* [ %incdec.ptr5, %while.body ], [ %pDst, %while.body.preheader ]
@@ -238,8 +238,8 @@ while.end: ; preds = %while.end.loopexit,
 }
 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %cmp1 = fcmp
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction: %cmp1 = fcmp
-; CHECK: LV: Found an estimated cost of 36 for VF 4 For instruction: %cmp1 = fcmp
+; CHECK: LV: Found an estimated cost of 12 for VF 2 For instruction: %cmp1 = fcmp
+; CHECK: LV: Found an estimated cost of 24 for VF 4 For instruction: %cmp1 = fcmp
 define void @floatcmp(float* nocapture readonly %pSrc, i32* nocapture %pDst, i32 %blockSize) #0 {
 entry:
 %cmp.not7 = icmp eq i32 %blockSize, 0
diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
index f511a81c2915..006fc47e5c32 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-interleaved-cost.ll
@@ -15,10 +15,10 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i8_factor_2"
-; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_4-LABEL: Checking a loop in "i8_factor_2"
 ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
@@ -56,10 +56,10 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i16_factor_2"
-; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_4-LABEL: Checking a loop in "i16_factor_2"
 ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
@@ -97,10 +97,10 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i32_factor_2"
-; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_4-LABEL: Checking a loop in "i32_factor_2"
 ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
@@ -138,25 +138,25 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i64_factor_2"
-; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2: Found an estimated cost of 44 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_4-LABEL: Checking a loop in "i64_factor_2"
-; VF_4: Found an estimated cost of 80 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4: Found an estimated cost of 88 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 48 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_8-LABEL: Checking a loop in "i64_factor_2"
-; VF_8: Found an estimated cost of 288 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8: Found an estimated cost of 176 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 160 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in "i64_factor_2"
-; VF_16: Found an estimated cost of 1088 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16: Found an estimated cost of 352 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 576 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
@@ -179,15 +179,15 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "f16_factor_2"
-; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_4-LABEL: Checking a loop in "f16_factor_2"
-; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_4: Found an estimated cost of 18 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
-; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_4-NEXT: Found an estimated cost of 16 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
 ; VF_8-LABEL: Checking a loop in "f16_factor_2"
 ; VF_8: Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
@@ -220,10 +220,10 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "f32_factor_2"
-; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load float, float* %tmp0, align 4
+; VF_2: Found an estimated cost of 10 for VF 2 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load float, float* %tmp1, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
+; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_4-LABEL: Checking a loop in "f32_factor_2"
 ; VF_4: Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load float, float* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load float, float* %tmp1, align 4
@@ -261,25 +261,25 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "f64_factor_2"
-; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_2: Found an estimated cost of 12 for VF 2 For instruction: %tmp2 = load double, double* %tmp0, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
-; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_2-NEXT: Found an estimated cost of 8 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_4-LABEL: Checking a loop in "f64_factor_2"
-; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp2 = load double, double* %tmp0, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
-; VF_4-NEXT: Found an estimated cost of 40 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_4-NEXT: Found an estimated cost of 16 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_8-LABEL: Checking a loop in "f64_factor_2"
-; VF_8: Found an estimated cost of 272 for VF 8 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_8: Found an estimated cost of 48 for VF 8 For instruction: %tmp2 = load double, double* %tmp0, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
-; VF_8-NEXT: Found an estimated cost of 144 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_8-NEXT: Found an estimated cost of 32 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 ; VF_16-LABEL: Checking a loop in "f64_factor_2"
-; VF_16: Found an estimated cost of 1056 for VF 16 For instruction: %tmp2 = load double, double* %tmp0, align 8
+; VF_16: Found an estimated cost of 96 for VF 16 For instruction: %tmp2 = load double, double* %tmp0, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load double, double* %tmp1, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
-; VF_16-NEXT: Found an estimated cost of 544 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
+; VF_16-NEXT: Found an estimated cost of 64 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %f64.2, %f64.2* %data, i64 %i, i32 0
@@ -306,33 +306,33 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i8_factor_3"
-; VF_2: Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
+; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_4-LABEL: Checking a loop in "i8_factor_3"
-; VF_4: Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
+; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_4-NEXT: Found an estimated cost of 60 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_8-LABEL: Checking a loop in "i8_factor_3"
-; VF_8: Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
+; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_8-NEXT: Found an estimated cost of 216 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1
 ; VF_16-LABEL: Checking a loop in "i8_factor_3"
-; VF_16: Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
+; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp2, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
-; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i8.3, %i8.3* %data, i64 %i, i32 0
@@ -358,33 +358,33 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i16_factor_3"
-; VF_2: Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
+; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_4-LABEL: Checking a loop in "i16_factor_3"
-; VF_4: Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
+; VF_4: Found an estimated cost of 72 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 60 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_8-LABEL: Checking a loop in "i16_factor_3"
-; VF_8: Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
+; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 216 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2
 ; VF_16-LABEL: Checking a loop in "i16_factor_3"
-; VF_16: Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
+; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp2, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i16.3, %i16.3* %data, i64 %i, i32 0
@@ -410,12 +410,12 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i32_factor_3"
-; VF_2: Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
+; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i32, i32* %tmp1, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, i32* %tmp2, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_4-LABEL: Checking a loop in "i32_factor_3"
 ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp1, align 4
@@ -424,19 +424,19 @@ entry:
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_8-LABEL: Checking a loop in "i32_factor_3"
-; VF_8: Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
+; VF_8: Found an estimated cost of 144 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp2, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 216 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_16-LABEL: Checking a loop in "i32_factor_3"
-; VF_16: Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
+; VF_16: Found an estimated cost of 288 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp2, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i32.3, %i32.3* %data, i64 %i, i32 0
@@ -462,33 +462,33 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i64_factor_3"
-; VF_2: Found an estimated cost of 36 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_2: Found an estimated cost of 66 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
 ; VF_4-LABEL: Checking a loop in "i64_factor_3"
-; VF_4: Found an estimated cost of 120 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_4: Found an estimated cost of 132 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 72 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 36 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
 ; VF_8-LABEL: Checking a loop in "i64_factor_3"
-; VF_8: Found an estimated cost of 432 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_8: Found an estimated cost of 264 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 240 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 72 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in "i64_factor_3"
-; VF_16: Found an estimated cost of 1632 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
+; VF_16: Found an estimated cost of 528 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp0, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, i64* %tmp2, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 864 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 144 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i64.3, %i64.3* %data, i64 %i, i32 0
@@ -514,33 +514,33 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "f16_factor_3"
-; VF_2: Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load half, half* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load half, half* %tmp1, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, half* %tmp2, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_4-LABEL: Checking a loop in "f16_factor_3"
-; VF_4: Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_4: Found an estimated cost of 28 for VF 4 For instruction: %tmp3 = load half, half* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load half, half* %tmp1, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, half* %tmp2, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_4-NEXT: Found an estimated cost of 60 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_8-LABEL: Checking a loop in "f16_factor_3"
-; VF_8: Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_8: Found an estimated cost of 56 for VF 8 For instruction: %tmp3 = load half, half* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load half, half* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp2, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_8-NEXT: Found an estimated cost of 216 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2
 ; VF_16-LABEL: Checking a loop in "f16_factor_3"
-; VF_16: Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load half, half* %tmp0, align 2
+; VF_16: Found an estimated cost of 112 for VF 16 For instruction: %tmp3 = load half, half* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load half, half* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp2, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2
-; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %f16.3, %f16.3* %data, i64 %i, i32 0
@@ -566,12 +566,12 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "f32_factor_3"
-; VF_2: Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_2: Found an estimated cost of 16 for VF 2 For instruction: %tmp3 = load float, float* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load float, float* %tmp1, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, float* %tmp2, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_4-LABEL: Checking a loop in "f32_factor_3"
 ; VF_4: Found an estimated cost of 24 for VF 4 For instruction: %tmp3 = load float, float* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load float, float* %tmp1, align 4
@@ -580,19 +580,19 @@ entry:
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp1, align 4
 ; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_8-LABEL: Checking a loop in "f32_factor_3"
-; VF_8: Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_8: Found an estimated cost of 64 for VF 8 For instruction: %tmp3 = load float, float* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load float, float* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp2, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4
-; VF_8-NEXT: Found an estimated cost of 216 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 ; VF_16-LABEL: Checking a loop in "f32_factor_3"
-; VF_16: Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load float, float* %tmp0, align 4
+; VF_16: Found an estimated cost of 128 for VF 16 For instruction: %tmp3 = load float, float* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load float, float* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp2, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4
-; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %f32.3, %f32.3* %data, i64 %i, i32 0
@@ -618,33 +618,33 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "f64_factor_3"
-; VF_2: Found an estimated cost of 30 for VF 2 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp3 = load double, double* %tmp0, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp4 = load double, double* %tmp1, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, double* %tmp2, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8
-; VF_2-NEXT: Found an estimated cost of 18 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_2-NEXT: Found an estimated cost of 12 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8
 ; VF_4-LABEL: Checking a loop in "f64_factor_3"
-; VF_4: Found an estimated cost of 108 for VF 4 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_4: Found an estimated cost of 36 for VF 4 For instruction: %tmp3 = load double, double* %tmp0, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp4 = load double, double* %tmp1, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, double* %tmp2, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8
-; VF_4-NEXT: Found an estimated cost of 60 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_4-NEXT: Found an estimated cost of 24 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8
 ; VF_8-LABEL: Checking a loop in "f64_factor_3"
-; VF_8: Found an estimated cost of 408 for VF 8 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_8: Found an estimated cost of 72 for VF 8 For instruction: %tmp3 = load double, double* %tmp0, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp4 = load double, double* %tmp1, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, double* %tmp2, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8
-; VF_8-NEXT: Found an estimated cost of 216 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_8-NEXT: Found an estimated cost of 48 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8
 ; VF_16-LABEL: Checking a loop in "f64_factor_3"
-; VF_16: Found an estimated cost of 1584 for VF 16 For instruction: %tmp3 = load double, double* %tmp0, align 8
+; VF_16: Found an estimated cost of 144 for VF 16 For instruction: %tmp3 = load double, double* %tmp0, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp4 = load double, double* %tmp1, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, double* %tmp2, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8
-; VF_16-NEXT: Found an estimated cost of 816 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8
+; VF_16-NEXT: Found an estimated cost of 96 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %f64.3, %f64.3* %data, i64 %i, i32 0
@@ -673,41 +673,41 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i8_factor_4"
-; VF_2: Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store i8 0, i8* %tmp3, align 1
 ; VF_4-LABEL: Checking a loop in "i8_factor_4"
-; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store i8 0, i8* %tmp3, align 1
 ; VF_8-LABEL: Checking a loop in "i8_factor_4"
-; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store i8 0, i8* %tmp3, align 1
 ; VF_16-LABEL: Checking a loop in "i8_factor_4"
-; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
+; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i8, i8* %tmp0, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i8, i8* %tmp1, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i8, i8* %tmp2, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i8, i8* %tmp3, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp2, align 1
-; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
+; VF_16-NEXT: Found an estimated cost of 128 for VF 16 For instruction: store i8 0, i8* %tmp3, align 1
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i8.4, %i8.4* %data, i64 %i, i32 0
@@ -736,41 +736,41 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i16_factor_4"
-; VF_2: Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_4-LABEL: Checking a loop in "i16_factor_4"
-; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_4: Found an estimated cost of 96 for VF 4 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_8-LABEL: Checking a loop in "i16_factor_4"
-; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store i16 0, i16* %tmp3, align 2
 ; VF_16-LABEL: Checking a loop in "i16_factor_4"
-; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
+; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i16, i16* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i16, i16* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i16, i16* %tmp2, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i16, i16* %tmp3, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp2, align 2
-; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
+; VF_16-NEXT: Found an estimated cost of 128 for VF 16 For instruction: store i16 0, i16* %tmp3, align 2
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i16.4, %i16.4* %data, i64 %i, i32 0
@@ -799,14 +799,14 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i32_factor_4"
-; VF_2: Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_4-LABEL: Checking a loop in "i32_factor_4"
 ; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
@@ -817,23 +817,23 @@ entry:
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp2, align 4
 ; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_8-LABEL: Checking a loop in "i32_factor_4"
-; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_8: Found an estimated cost of 192 for VF 8 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store i32 0, i32* %tmp3, align 4
 ; VF_16-LABEL: Checking a loop in "i32_factor_4"
-; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
+; VF_16: Found an estimated cost of 384 for VF 16 For instruction: %tmp4 = load i32, i32* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i32, i32* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i32, i32* %tmp2, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i32, i32* %tmp3, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp2, align 4
-; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
+; VF_16-NEXT: Found an estimated cost of 128 for VF 16 For instruction: store i32 0, i32* %tmp3, align 4
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i32.4, %i32.4* %data, i64 %i, i32 0
@@ -862,41 +862,41 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "i64_factor_4"
-; VF_2: Found an estimated cost of 48 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_2: Found an estimated cost of 88 for VF 2 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp2, align 8
-; VF_2-NEXT: Found an estimated cost of 32 for VF 2 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store i64 0, i64* %tmp3, align 8
 ; VF_4-LABEL: Checking a loop in "i64_factor_4"
-; VF_4: Found an estimated cost of 160 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_4: Found an estimated cost of 176 for VF 4 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp2, align 8
-; VF_4-NEXT: Found an estimated cost of 96 for VF 4 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_4-NEXT: Found an estimated cost of 48 for VF 4 For instruction: store i64 0, i64* %tmp3, align 8
 ; VF_8-LABEL: Checking a loop in "i64_factor_4"
-; VF_8: Found an estimated cost of 576 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_8: Found an estimated cost of 352 for VF 8 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp2, align 8
-; VF_8-NEXT: Found an estimated cost of 320 for VF 8 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_8-NEXT: Found an estimated cost of 96 for VF 8 For instruction: store i64 0, i64* %tmp3, align 8
 ; VF_16-LABEL: Checking a loop in "i64_factor_4"
-; VF_16: Found an estimated cost of 2176 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
+; VF_16: Found an estimated cost of 704 for VF 16 For instruction: %tmp4 = load i64, i64* %tmp0, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load i64, i64* %tmp1, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load i64, i64* %tmp2, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load i64, i64* %tmp3, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp2, align 8
-; VF_16-NEXT: Found an estimated cost of 1152 for VF 16 For instruction: store i64 0, i64* %tmp3, align 8
+; VF_16-NEXT: Found an estimated cost of 192 for VF 16 For instruction: store i64 0, i64* %tmp3, align 8
 for.body:
 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
 %tmp0 = getelementptr inbounds %i64.4, %i64.4* %data, i64 %i, i32 0
@@ -925,41 +925,41 @@ entry:
 br label %for.body
 ; VF_2-LABEL: Checking a loop in "f16_factor_4"
-; VF_2: Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load half, half* %tmp0, align 2
+; VF_2: Found an estimated cost of 18 for VF 2 For instruction: %tmp4 = load half, half* %tmp0, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load half, half* %tmp1, align 2
 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load half, half* %tmp2, align 2
 ; VF_2-NEXT: Found an
estimated cost of 0 for VF 2 For instruction: %tmp7 = load half, half* %tmp3, align 2 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store half 0xH0000, half* %tmp2, align 2 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store half 0xH0000, half* %tmp3, align 2 ; VF_4-LABEL: Checking a loop in "f16_factor_4" -; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load half, half* %tmp0, align 2 +; VF_4: Found an estimated cost of 36 for VF 4 For instruction: %tmp4 = load half, half* %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load half, half* %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load half, half* %tmp2, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load half, half* %tmp3, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp2, align 2 -; VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp3, align 2 ; VF_8-LABEL: Checking a loop in "f16_factor_4" -; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2 +; VF_8: Found an estimated cost of 72 for VF 8 For instruction: %tmp4 = load half, half* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load half, half* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load half, half* %tmp2, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load half, half* %tmp3, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp2, align 2 -; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp3, align 2 ; VF_16-LABEL: Checking a loop in "f16_factor_4" -; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2 +; VF_16: Found an estimated cost of 144 for VF 16 For instruction: %tmp4 = load half, half* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load half, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load half, half* %tmp2, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load half, half* %tmp3, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For 
instruction: store half 0xH0000, half* %tmp0, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp1, align 2 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store half 0xH0000, half* %tmp2, align 2 -; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2 +; VF_16-NEXT: Found an estimated cost of 128 for VF 16 For instruction: store half 0xH0000, half* %tmp3, align 2 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f16.4, %f16.4* %data, i64 %i, i32 0 @@ -988,14 +988,14 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in "f32_factor_4" -; VF_2: Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_2: Found an estimated cost of 20 for VF 2 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store float 0.000000e+00, float* %tmp3, align 4 ; VF_4-LABEL: Checking a loop in "f32_factor_4" ; VF_4: Found an estimated cost of 32 for VF 4 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load float, float* %tmp1, align 4 @@ -1006,23 +1006,23 @@ entry: ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store float 0.000000e+00, float* %tmp2, align 4 ; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store float 0.000000e+00, float* %tmp3, align 4 ; VF_8-LABEL: Checking a loop in "f32_factor_4" -; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_8: Found an estimated cost of 80 for VF 8 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store float 0.000000e+00, float* 
%tmp3, align 4 ; VF_16-LABEL: Checking a loop in "f32_factor_4" -; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4 +; VF_16: Found an estimated cost of 160 for VF 16 For instruction: %tmp4 = load float, float* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load float, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load float, float* %tmp2, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load float, float* %tmp3, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp0, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp1, align 4 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store float 0.000000e+00, float* %tmp2, align 4 -; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4 +; VF_16-NEXT: Found an estimated cost of 128 for VF 16 For instruction: store float 0.000000e+00, float* %tmp3, align 4 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f32.4, %f32.4* %data, i64 %i, i32 0 @@ -1051,41 +1051,41 @@ entry: br label %for.body ; VF_2-LABEL: Checking a loop in "f64_factor_4" -; VF_2: Found an estimated cost of 40 for VF 2 For instruction: %tmp4 = load double, double* %tmp0, align 8 +; VF_2: Found an estimated cost of 24 for VF 2 For instruction: %tmp4 = load double, double* %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp5 = load double, double* %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp6 = load double, double* %tmp2, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: %tmp7 = load double, double* %tmp3, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp0, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp1, align 8 ; VF_2-NEXT: Found an estimated cost of 0 for VF 2 For instruction: store double 0.000000e+00, double* %tmp2, align 8 -; VF_2-NEXT: Found an estimated cost of 24 for VF 2 For instruction: store double 0.000000e+00, double* %tmp3, align 8 +; VF_2-NEXT: Found an estimated cost of 16 for VF 2 For instruction: store double 0.000000e+00, double* %tmp3, align 8 ; VF_4-LABEL: Checking a loop in "f64_factor_4" -; VF_4: Found an estimated cost of 144 for VF 4 For instruction: %tmp4 = load double, double* %tmp0, align 8 +; VF_4: Found an estimated cost of 48 for VF 4 For instruction: %tmp4 = load double, double* %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp5 = load double, double* %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp6 = load double, double* %tmp2, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: %tmp7 = load double, double* %tmp3, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp0, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp1, align 8 ; VF_4-NEXT: Found an estimated cost of 0 for VF 4 For instruction: store double 0.000000e+00, double* %tmp2, align 8 -; 
VF_4-NEXT: Found an estimated cost of 80 for VF 4 For instruction: store double 0.000000e+00, double* %tmp3, align 8 +; VF_4-NEXT: Found an estimated cost of 32 for VF 4 For instruction: store double 0.000000e+00, double* %tmp3, align 8 ; VF_8-LABEL: Checking a loop in "f64_factor_4" -; VF_8: Found an estimated cost of 544 for VF 8 For instruction: %tmp4 = load double, double* %tmp0, align 8 +; VF_8: Found an estimated cost of 96 for VF 8 For instruction: %tmp4 = load double, double* %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp5 = load double, double* %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp6 = load double, double* %tmp2, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: %tmp7 = load double, double* %tmp3, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp0, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp1, align 8 ; VF_8-NEXT: Found an estimated cost of 0 for VF 8 For instruction: store double 0.000000e+00, double* %tmp2, align 8 -; VF_8-NEXT: Found an estimated cost of 288 for VF 8 For instruction: store double 0.000000e+00, double* %tmp3, align 8 +; VF_8-NEXT: Found an estimated cost of 64 for VF 8 For instruction: store double 0.000000e+00, double* %tmp3, align 8 ; VF_16-LABEL: Checking a loop in "f64_factor_4" -; VF_16: Found an estimated cost of 2112 for VF 16 For instruction: %tmp4 = load double, double* %tmp0, align 8 +; VF_16: Found an estimated cost of 192 for VF 16 For instruction: %tmp4 = load double, double* %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp5 = load double, double* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp6 = load double, double* %tmp2, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: %tmp7 = load double, double* %tmp3, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp0, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp1, align 8 ; VF_16-NEXT: Found an estimated cost of 0 for VF 16 For instruction: store double 0.000000e+00, double* %tmp2, align 8 -; VF_16-NEXT: Found an estimated cost of 1088 for VF 16 For instruction: store double 0.000000e+00, double* %tmp3, align 8 +; VF_16-NEXT: Found an estimated cost of 128 for VF 16 For instruction: store double 0.000000e+00, double* %tmp3, align 8 for.body: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ] %tmp0 = getelementptr inbounds %f64.4, %f64.4* %data, i64 %i, i32 0 diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll index 3356db935eea..30bc06a7a0b1 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-saddsatcost.ll @@ -8,7 +8,7 @@ target triple = "thumbv8.1m.main-arm-none-eabi" ; CHECK-COST-LABEL: arm_offset_q15 ; CHECK-COST: Found an estimated cost of 10 for VF 1 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 28 for VF 2 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) +; CHECK-COST: Found an estimated cost of 36 for VF 2 For instruction: %1 = tail call i16 
@llvm.sadd.sat.i16(i16 %0, i16 %offset) ; CHECK-COST: Found an estimated cost of 8 for VF 4 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) ; CHECK-COST: Found an estimated cost of 2 for VF 8 For instruction: %1 = tail call i16 @llvm.sadd.sat.i16(i16 %0, i16 %offset) diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll index 730e26af0bb2..59422cb07d6a 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll @@ -688,19 +688,37 @@ end: define hidden void @pointer_phi_v4half_add3(half* noalias nocapture readonly %A, half* noalias nocapture %B, half %y) { ; CHECK-LABEL: @pointer_phi_v4half_add3( ; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr half, half* [[A:%.*]], i32 2976 +; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr half, half* [[B:%.*]], i32 992 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x half> poison, half [[Y:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x half> [[BROADCAST_SPLATINSERT]], <8 x half> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[INDEX]], 3 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr half, half* [[A]], i32 [[TMP0]] +; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr half, half* [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast half* [[NEXT_GEP]] to <24 x half>* +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x half>, <24 x half>* [[TMP1]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x half> [[WIDE_VEC]], <24 x half> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <8 x half> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast half* [[NEXT_GEP4]] to <8 x half>* +; CHECK-NEXT: store <8 x half> [[TMP2]], <8 x half>* [[TMP3]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 +; CHECK-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]] ; CHECK: for.body: -; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi half* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi half* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = load half, half* [[A_ADDR_09]], align 4 +; CHECK-NEXT: [[A_ADDR_09:%.*]] = phi half* [ [[ADD_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ] +; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi half* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END3]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = load half, half* [[A_ADDR_09]], align 4 ; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds half, half* [[A_ADDR_09]], i32 3 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP0]], [[Y:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP5]], [[Y]] ; CHECK-NEXT: store half [[ADD]], half* [[B_ADDR_07]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds half, half* [[B_ADDR_07]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp 
eq i32 [[INC]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], [[LOOP23:!llvm.loop !.*]] ; CHECK: end: ; CHECK-NEXT: ret void ; @@ -753,7 +771,7 @@ define hidden void @pointer_phi_v4i32_uf2(i32* noalias nocapture readonly %A, i3 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9992 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 48 -; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], [[LOOP22:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: @@ -767,7 +785,7 @@ define hidden void @pointer_phi_v4i32_uf2(i32* noalias nocapture readonly %A, i3 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_06]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], [[LOOP23:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], [[LOOP25:!llvm.loop !.*]] ; entry: @@ -837,7 +855,7 @@ define hidden void @pointer_phi_v4i32_uf4(i32* noalias nocapture readonly %A, i3 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 9984 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i32, i32* [[POINTER_PHI]], i32 96 -; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], [[LOOP24:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP15]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], [[LOOP26:!llvm.loop !.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: @@ -851,7 +869,7 @@ define hidden void @pointer_phi_v4i32_uf4(i32* noalias nocapture readonly %A, i3 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, i32* [[B_ADDR_06]], i32 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], [[LOOP25:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], [[LOOP27:!llvm.loop !.*]] ; entry: br label %for.body @@ -893,23 +911,23 @@ define hidden void @mult_ptr_iv(i8* noalias nocapture readonly %x, i8* noalias n ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[POINTER_PHI5]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP0]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP0]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP0]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !28 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP0]], i32 2 -; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP2]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 -; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP3]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !26 +; CHECK-NEXT: 
[[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP2]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !28 +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> [[TMP3]], i32 1, <4 x i1> , <4 x i8> undef), !alias.scope !28 ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]] ; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER8]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP4]], <4 x i8*> [[TMP1]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP4]], <4 x i8*> [[TMP1]], i32 1, <4 x i1> ), !alias.scope !31, !noalias !28 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i32 2 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP5]], <4 x i8*> [[TMP7]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP6]], <4 x i8*> [[TMP8]], i32 1, <4 x i1> ), !alias.scope !29, !noalias !26 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP5]], <4 x i8*> [[TMP7]], i32 1, <4 x i1> ), !alias.scope !31, !noalias !28 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> [[TMP6]], <4 x i8*> [[TMP8]], i32 1, <4 x i1> ), !alias.scope !31, !noalias !28 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i32 12 ; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, i8* [[POINTER_PHI5]], i32 12 -; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], [[LOOP31:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP9]], label [[END:%.*]], label [[VECTOR_BODY]], [[LOOP33:!llvm.loop !.*]] ; CHECK: for.body: ; CHECK-NEXT: [[X_ADDR_050:%.*]] = phi i8* [ [[INCDEC_PTR2:%.*]], [[FOR_BODY]] ], [ [[X]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[Z_ADDR_049:%.*]] = phi i8* [ [[INCDEC_PTR34:%.*]], [[FOR_BODY]] ], [ [[Z]], [[ENTRY]] ] @@ -931,7 +949,7 @@ define hidden void @mult_ptr_iv(i8* noalias nocapture readonly %x, i8* noalias n ; CHECK-NEXT: store i8 [[MUL2]], i8* [[INCDEC_PTR33]], align 1 ; CHECK-NEXT: [[INC]] = add nuw i32 [[I_048]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[FOR_BODY]], [[LOOP32:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[END]], label [[FOR_BODY]], [[LOOP34:!llvm.loop !.*]] ; CHECK: end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll index b67475f19fd1..7e33b331aaa8 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-not-allowed.ll @@ -465,25 +465,58 @@ for.body: define void @fptrunc_not_allowed(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C, half* noalias nocapture %D) #0 { ; CHECK-LABEL: @fptrunc_not_allowed( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: 
vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>* +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = fptrunc <4 x float> [[TMP7]] to <4 x half> +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x half> [[TMP11]], +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds half, half* [[D:%.*]], i32 [[TMP0]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds half, half* [[TMP13]], i32 0 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast half* [[TMP14]] to <4 x half>* +; CHECK-NEXT: store <4 x half> [[TMP12]], <4 x half>* [[TMP15]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 428 +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 431, 428 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[I_017:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i32 [[I_017]] -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[C:%.*]], i32 [[I_017]] -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i32 [[I_017]] +; CHECK-NEXT: [[I_017:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i32 [[I_017]] +; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[C]], i32 [[I_017]] +; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP18]], [[TMP17]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i32 [[I_017]] ; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: 
[[CONV:%.*]] = fptrunc float [[ADD]] to half ; CHECK-NEXT: [[FACTOR:%.*]] = fmul fast half [[CONV]], 0xH4000 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half* [[D:%.*]], i32 [[I_017]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds half, half* [[D]], i32 [[I_017]] ; CHECK-NEXT: store half [[FACTOR]], half* [[ARRAYIDX5]], align 2 ; CHECK-NEXT: [[ADD6]] = add nuw nsw i32 [[I_017]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[ADD6]], 431 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP13:!llvm.loop !.*]] ; entry: br label %for.body @@ -591,7 +624,7 @@ define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] @@ -609,7 +642,7 @@ define i32 @i32_smin_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP13:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -661,7 +694,7 @@ define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] @@ -679,7 +712,7 @@ define i32 @i32_smax_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -731,7 +764,7 @@ define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 
{ ; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] @@ -749,7 +782,7 @@ define i32 @i32_umin_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP19:!llvm.loop !.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] @@ -801,7 +834,7 @@ define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] @@ -819,7 +852,7 @@ define i32 @i32_umax_reduction(i32* nocapture readonly %x, i32 %n) #0 { ; CHECK-NEXT: [[ADD]] = select i1 [[C]], i32 [[R_07]], i32 [[TMP8]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP19:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP21:!llvm.loop !.*]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -- GitLab From 1b7498120d2f24fac2fabdc167c268a4cd78cec7 Mon Sep 17 00:00:00 2001 From: Rob Suderman Date: Fri, 19 Mar 2021 11:24:52 -0700 Subject: [PATCH 0184/1000] [mlir][tosa] Add tosa.logical_* to linalg lowerings Adds lowerings for logical_* boolean operations. Each of these ops only operate on booleans allowing simple lowerings. 
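As a rough illustration (a sketch, not code from this patch: the function name, the affine map, and the linalg.init_tensor plumbing are assumptions based on the tests below), a logical_not on an i1 tensor lowers to a linalg.generic whose body xors each element with true:

#map = affine_map<(d0) -> (d0)>

// Hypothetical lowered form of "tosa.logical_not" on a tensor<1xi1>.
func @logical_not_sketch(%arg0: tensor<1xi1>) -> tensor<1xi1> {
  %init = linalg.init_tensor [1] : tensor<1xi1>
  %0 = linalg.generic {indexing_maps = [#map, #map],
                       iterator_types = ["parallel"]}
      ins(%arg0 : tensor<1xi1>) outs(%init : tensor<1xi1>) {
  ^bb0(%in: i1, %out: i1):
    // logical_not(x) is computed as x xor true.
    %true = constant true
    %not = xor %in, %true : i1
    linalg.yield %not : i1
  } -> tensor<1xi1>
  return %0 : tensor<1xi1>
}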
Reviewed By: NatashaKnk

Differential Revision: https://reviews.llvm.org/D98910
---
 .../Conversion/TosaToLinalg/TosaToLinalg.cpp  | 25 ++++++++++++++++++-
 .../TosaToLinalg/tosa-to-linalg.mlir          | 24 ++++++++++++++++++
 2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 5db47b423d89..903e4cc765aa 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -149,10 +149,29 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args,
   if (isa<tosa::LogicalLeftShiftOp>(op) && elementTy.isa<IntegerType>())
     return rewriter.create<mlir::ShiftLeftOp>(loc, resultTypes, args);
 
-  // tosa::LogicalrightShiftOp
+  // tosa::LogicalRightShiftOp
   if (isa<tosa::LogicalRightShiftOp>(op) && elementTy.isa<IntegerType>())
     return rewriter.create<mlir::UnsignedShiftRightOp>(loc, resultTypes, args);
 
+  // tosa::LogicalAnd
+  if (isa<tosa::LogicalAndOp>(op) && elementTy.isInteger(1))
+    return rewriter.create<mlir::AndOp>(loc, resultTypes, args);
+
+  // tosa::LogicalNot
+  if (isa<tosa::LogicalNotOp>(op) && elementTy.isInteger(1)) {
+    auto one = rewriter.create<mlir::ConstantOp>(
+        loc, rewriter.getIntegerAttr(elementTy, 1));
+    return rewriter.create<mlir::XOrOp>(loc, resultTypes, args[0], one);
+  }
+
+  // tosa::LogicalOr
+  if (isa<tosa::LogicalOrOp>(op) && elementTy.isInteger(1))
+    return rewriter.create<mlir::OrOp>(loc, resultTypes, args);
+
+  // tosa::LogicalXor
+  if (isa<tosa::LogicalXorOp>(op) && elementTy.isInteger(1))
+    return rewriter.create<mlir::XOrOp>(loc, resultTypes, args);
+
   // tosa::PowOp
   if (isa<tosa::PowOp>(op) && elementTy.isa<FloatType>())
     return rewriter.create<mlir::PowFOp>(loc, resultTypes, args);
@@ -869,6 +888,10 @@ void mlir::tosa::populateTosaToLinalgOnTensorsConversionPatterns(
       PointwiseConverter,
       PointwiseConverter,
       PointwiseConverter,
+      PointwiseConverter<tosa::LogicalAndOp>,
+      PointwiseConverter<tosa::LogicalNotOp>,
+      PointwiseConverter<tosa::LogicalOrOp>,
+      PointwiseConverter<tosa::LogicalXorOp>,
       PointwiseConverter,
       PointwiseConverter,
       PointwiseConverter,
       PointwiseConverter,
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index 1714f140dbfc..6f99d782d3af 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -260,6 +260,30 @@ func @test_simple_i32(%arg0: tensor<1xi32>) -> () {
 
 // -----
 
+// CHECK-LABEL: @test_bool
+func @test_bool(%arg0: tensor<1xi1>, %arg1: tensor<1xi1>) -> () {
+  // CHECK: linalg.generic
+  // CHECK: and
+  %0 = "tosa.logical_and"(%arg0, %arg1) : (tensor<1xi1>, tensor<1xi1>) -> tensor<1xi1>
+
+  // CHECK: linalg.generic
+  // CHECK: or
+  %1 = "tosa.logical_or"(%arg0, %arg1) : (tensor<1xi1>, tensor<1xi1>) -> tensor<1xi1>
+
+  // CHECK: linalg.generic
+  // CHECK: xor
+  %2 = "tosa.logical_xor"(%arg0, %arg1) : (tensor<1xi1>, tensor<1xi1>) -> tensor<1xi1>
+
+  // CHECK: linalg.generic
+  // CHECK: constant true
+  // CHECK: xor
+  %3 = "tosa.logical_not"(%arg0) : (tensor<1xi1>) -> tensor<1xi1>
+
+  return
+}
+
+// -----
+
 // CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d0, d1)>
 // CHECK-LABEL: @test_reshape_downrank
 func @test_reshape_downrank(%arg0: tensor<2x3xf32>) -> tensor<6xf32> {
-- 
GitLab


From 47286fc530159dfdbc28f14daaeff4066a1f3b1e Mon Sep 17 00:00:00 2001
From: Rob Suderman
Date: Fri, 19 Mar 2021 11:42:22 -0700
Subject: [PATCH 0185/1000] [mlir][tosa] Add tosa.cast to linalg lowering

Handles lowering from the tosa CastOp to the equivalent linalg lowering. It
includes support for interchange between bool, int, and floating point.
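For orientation, a hedged sketch of the conversions this enables at the TOSA level (the function below is illustrative and not part of the change; the scalar op named in each comment is the one the new tests check for):

// Hypothetical examples; each cast lowers to a linalg.generic whose scalar
// body applies the op named in the comment.
func @cast_sketch(%int: tensor<1xi32>, %fp: tensor<1xf32>) {
  // signed int -> float: sitofp
  %0 = "tosa.cast"(%int) : (tensor<1xi32>) -> tensor<1xf32>
  // float -> bool: compare against 0.0 with cmpf une
  %1 = "tosa.cast"(%fp) : (tensor<1xf32>) -> tensor<1xi1>
  // float -> narrower float: fptrunc
  %2 = "tosa.cast"(%fp) : (tensor<1xf32>) -> tensor<1xf16>
  return
}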
Reviewed By: antiagainst

Differential Revision: https://reviews.llvm.org/D98828
---
 .../Conversion/TosaToLinalg/TosaToLinalg.cpp  | 63 ++++++++++++++++++-
 .../TosaToLinalg/tosa-to-linalg.mlir          | 50 +++++++++++++++
 2 files changed, 112 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 903e4cc765aa..72b9aa850213 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -289,6 +289,67 @@ createLinalgBodyCalculationForElementwiseOp(Operation *op, ValueRange args,
                                        rewriter);
   }
 
+  // tosa::CastOp
+  if (isa<tosa::CastOp>(op)) {
+    Type srcTy = elementTy;
+    Type dstTy = resultTypes.front();
+    bool bitExtend =
+        srcTy.getIntOrFloatBitWidth() < dstTy.getIntOrFloatBitWidth();
+
+    if (srcTy == dstTy)
+      return args.front();
+
+    if (srcTy.isa<FloatType>() && dstTy.isa<FloatType>() && bitExtend)
+      return rewriter.create<mlir::FPExtOp>(loc, resultTypes, args, mlir::None);
+
+    if (srcTy.isa<FloatType>() && dstTy.isa<FloatType>() && !bitExtend)
+      return rewriter.create<mlir::FPTruncOp>(loc, resultTypes, args,
+                                              mlir::None);
+
+    // 1-bit integers need to be treated as signless.
+    if (srcTy.isInteger(1) && mlir::UIToFPOp::areCastCompatible(srcTy, dstTy))
+      return rewriter.create<mlir::UIToFPOp>(loc, resultTypes, args,
+                                             mlir::None);
+
+    if (srcTy.isInteger(1) && dstTy.isa<IntegerType>() && bitExtend)
+      return rewriter.create<mlir::ZeroExtendIOp>(loc, resultTypes, args,
+                                                  mlir::None);
+
+    // All other si-to-fp conversions should be handled by SIToFP.
+    if (mlir::SIToFPOp::areCastCompatible(srcTy, dstTy))
+      return rewriter.create<mlir::SIToFPOp>(loc, resultTypes, args,
+                                             mlir::None);
+
+    // Casting to boolean, floats need to only be checked as not-equal to zero.
+    if (srcTy.isa<FloatType>() && dstTy.isInteger(1)) {
+      Value zero =
+          rewriter.create<mlir::ConstantOp>(loc, rewriter.getFloatAttr(srcTy, 0.0));
+      return rewriter.create<mlir::CmpFOp>(loc, CmpFPredicate::UNE,
+                                           args.front(), zero);
+    }
+
+    if (mlir::FPToSIOp::areCastCompatible(srcTy, dstTy))
+      return rewriter.create<mlir::FPToSIOp>(loc, resultTypes, args,
+                                             mlir::None);
+
+    // Casting to boolean, integers need to only be checked as not-equal to
+    // zero.
+    if (srcTy.isa<IntegerType>() && dstTy.isInteger(1)) {
+      Value zero =
+          rewriter.create<mlir::ConstantIntOp>(loc, 0, srcTy.getIntOrFloatBitWidth());
+      return rewriter.create<mlir::CmpIOp>(loc, CmpIPredicate::ne, args.front(),
+                                           zero);
+    }
+
+    if (srcTy.isa<IntegerType>() && dstTy.isa<IntegerType>() && bitExtend)
+      return rewriter.create<mlir::SignExtendIOp>(loc, resultTypes, args,
+                                                  mlir::None);
+
+    if (srcTy.isa<IntegerType>() && dstTy.isa<IntegerType>() && !bitExtend)
+      return rewriter.create<mlir::TruncateIOp>(loc, resultTypes, args,
+                                                mlir::None);
+  }
+
   (void)rewriter.notifyMatchFailure(
       op, "unhandled op for linalg body calculation for elementwise op");
   return nullptr;
@@ -891,7 +952,7 @@ void mlir::tosa::populateTosaToLinalgOnTensorsConversionPatterns(
       PointwiseConverter,
       PointwiseConverter,
       PointwiseConverter,
-      PointwiseConverter,
+      PointwiseConverter<tosa::CastOp>, PointwiseConverter,
       PointwiseConverter,
       PointwiseConverter,
       PointwiseConverter,
       PointwiseConverter,
diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
index 6f99d782d3af..f25eb3f346ba 100644
--- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
+++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir
@@ -180,6 +180,35 @@ func @test_simple_f32(%arg0: tensor<1xf32>) -> () {
   // CHECK: select
   %18 = "tosa.reluN"(%0) {max_int = 5 : i64, max_fp = 5.0 : f32} : (tensor<1xf32>) -> tensor<1xf32>
 
+  // CHECK: linalg.generic
+  // CHECK: fptosi
+  %19 = "tosa.cast"(%0) : (tensor<1xf32>) -> tensor<1xi32>
+
+  // CHECK: linalg.generic
+  // CHECK: constant 0
+  // CHECK: cmpf
+  %20 = "tosa.cast"(%0) : (tensor<1xf32>) -> tensor<1xi1>
+
+  // CHECK: linalg.generic
+  // CHECK: fptrunc
+  %21 = "tosa.cast"(%0) : (tensor<1xf32>) -> tensor<1xf16>
+
+  // CHECK: linalg.generic
+  // CHECK: yield
+  %22 = "tosa.cast"(%0) : (tensor<1xf32>) -> tensor<1xf32>
+
   return
 }
 
+// -----
+
+// CHECK-LABEL: @test_simple_f16
+func @test_simple_f16(%arg0: tensor<1xf16>) -> () {
+
+  // CHECK: linalg.generic
+  // CHECK: fpext
+  %0 = "tosa.cast"(%arg0) : (tensor<1xf16>) -> tensor<1xf32>
+
+  return
+}
+
@@ -255,6 +284,27 @@ func @test_simple_i32(%arg0: tensor<1xi32>) -> () {
   // CHECK: select
   %15 = "tosa.reluN"(%0) {max_int = 5 : i64, max_fp = 5.0 : f32} : (tensor<1xi32>) -> tensor<1xi32>
 
+  // CHECK: linalg.generic
+  // CHECK: trunci
+  %16 = "tosa.cast"(%0) : (tensor<1xi32>) -> tensor<1xi16>
+
+  // CHECK: linalg.generic
+  // CHECK: yield
+  %17 = "tosa.cast"(%0) : (tensor<1xi32>) -> tensor<1xi32>
+
+  // CHECK: linalg.generic
+  // CHECK: sexti
+  %18 = "tosa.cast"(%0) : (tensor<1xi32>) -> tensor<1xi64>
+
+  // CHECK: linalg.generic
+  // CHECK: constant 0
+  // CHECK: cmpi
+  %19 = "tosa.cast"(%0) : (tensor<1xi32>) -> tensor<1xi1>
+
+  // CHECK: linalg.generic
+  // CHECK: sitofp
+  %20 = "tosa.cast"(%0) : (tensor<1xi32>) -> tensor<1xf32>
+
   return
 }
-- 
GitLab


From 1066dcb5503006acd193b9d2793e065a1098e0e3 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 19 Mar 2021 11:23:27 -0700
Subject: [PATCH 0186/1000] [AArch64] Fix LowerMGATHER to return the chain
 result for floating point gathers.

Found by adding asserts to LegalizeDAG to make sure custom legalized
results had the right types.
Reviewed By: kmclaughlin Differential Revision: https://reviews.llvm.org/D98968 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 757d838ad3fe..5ab8d8a5d6f1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4118,7 +4118,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op, if (VT.isFloatingPoint()) { SDValue Cast = getSVESafeBitCast(VT, Gather, DAG); - return DAG.getMergeValues({Cast, Gather}, DL); + return DAG.getMergeValues({Cast, Gather.getValue(1)}, DL); } return Gather; -- GitLab From e27654f737da8e3a80d8c1e3509868ab7fb4265b Mon Sep 17 00:00:00 2001 From: Arnamoy Bhattacharyya Date: Fri, 19 Mar 2021 14:54:06 -0400 Subject: [PATCH 0187/1000] [Flang][OpenMP] Add more sema checks for ordered construct This patch fixes a bug to allow ordered construct within a non-worksharing loop, also adds more sema checks. Reviewed By: kiranchandramohan Differential Revision: https://reviews.llvm.org/D98733 --- flang/lib/Semantics/check-omp-structure.cpp | 12 ++- flang/lib/Semantics/check-omp-structure.h | 3 + flang/test/Semantics/omp-ordered-simd.f90 | 95 +++++++++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 flang/test/Semantics/omp-ordered-simd.f90 diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index a3a3fd5d3524..3ed86132cbea 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -343,12 +343,22 @@ void OmpStructureChecker::Enter(const parser::OpenMPBlockConstruct &x) { void OmpStructureChecker::CheckIfDoOrderedClause( const parser::OmpBlockDirective &blkDirective) { if (blkDirective.v == llvm::omp::OMPD_ordered) { - if (!FindClauseParent(llvm::omp::Clause::OMPC_ordered)) { + // Loops + if (llvm::omp::doSet.test(GetContextParent().directive) && + !FindClauseParent(llvm::omp::Clause::OMPC_ordered)) { context_.Say(blkDirective.source, "The ORDERED clause must be present on the loop" " construct if any ORDERED region ever binds" " to a loop region arising from the loop construct."_err_en_US); } + // Other disallowed nestings, these directives do not support + // ordered clause in them, so no need to check + else if (llvm::omp::nestedOrderedErrSet.test( + GetContextParent().directive)) { + context_.Say(blkDirective.source, + "`ORDERED` region may not be closely nested inside of " + "`CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region."_err_en_US); + } } } diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index f11ddc66b401..0d11f72b5bc8 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -73,6 +73,9 @@ static OmpDirectiveSet simdSet{Directive::OMPD_distribute_parallel_do_simd, Directive::OMPD_teams_distribute_simd}; static OmpDirectiveSet taskGeneratingSet{ OmpDirectiveSet{Directive::OMPD_task} | taskloopSet}; +static OmpDirectiveSet nestedOrderedErrSet{Directive::OMPD_critical, + Directive::OMPD_ordered, Directive::OMPD_atomic, Directive::OMPD_task, + Directive::OMPD_taskloop}; static OmpClauseSet privateSet{ Clause::OMPC_private, Clause::OMPC_firstprivate, Clause::OMPC_lastprivate}; static OmpClauseSet privateReductionSet{ diff --git a/flang/test/Semantics/omp-ordered-simd.f90 
b/flang/test/Semantics/omp-ordered-simd.f90 new file mode 100644 index 000000000000..d597191650e7 --- /dev/null +++ b/flang/test/Semantics/omp-ordered-simd.f90 @@ -0,0 +1,95 @@ +! RUN: %S/test_errors.sh %s %t %flang -fopenmp +! OpenMP Version 4.5 +! Various checks with the ordered construct + +SUBROUTINE WORK(I) + INTEGER I +END SUBROUTINE WORK + +SUBROUTINE ORDERED_GOOD(N) + INTEGER N, I, A(10), B(10), C(10) + !$OMP SIMD + DO I = 1,N + IF (I <= 10) THEN + !$OMP ORDERED SIMD + CALL WORK(I) + !$OMP END ORDERED + ENDIF + END DO + !$OMP END SIMD +END SUBROUTINE ORDERED_GOOD + +SUBROUTINE ORDERED_BAD(N) + INTEGER N, I, A(10), B(10), C(10) + + !$OMP DO SIMD + DO I = 1,N + IF (I <= 10) THEN + !ERROR: The ORDERED clause must be present on the loop construct if any ORDERED region ever binds to a loop region arising from the loop construct. + !$OMP ORDERED + CALL WORK(I) + !$OMP END ORDERED + ENDIF + END DO + !$OMP END DO SIMD + + !$OMP PARALLEL DO + DO I = 1,N + IF (I <= 10) THEN + !ERROR: The ORDERED clause must be present on the loop construct if any ORDERED region ever binds to a loop region arising from the loop construct. + !$OMP ORDERED + CALL WORK(I) + !$OMP END ORDERED + ENDIF + END DO + !$OMP END PARALLEL DO + + !$OMP CRITICAL + DO I = 1,N + IF (I <= 10) THEN + !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region. + !$OMP ORDERED + CALL WORK(I) + !$OMP END ORDERED + ENDIF + END DO + !$OMP END CRITICAL + + !$OMP CRITICAL + WRITE(*,*) I + !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region. + !$OMP ORDERED + CALL WORK(I) + !$OMP END ORDERED + !$OMP END CRITICAL + + !$OMP ORDERED + WRITE(*,*) I + IF (I <= 10) THEN + !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region. + !$OMP ORDERED + CALL WORK(I) + !$OMP END ORDERED + ENDIF + !$OMP END ORDERED + + !$OMP TASK + C = C - A * B + !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region. + !$OMP ORDERED + CALL WORK(I) + !$OMP END ORDERED + !$OMP END TASK + + !$OMP TASKLOOP + DO I = 1,N + IF (I <= 10) THEN + !ERROR: `ORDERED` region may not be closely nested inside of `CRITICAL`, `ORDERED`, explicit `TASK` or `TASKLOOP` region. + !$OMP ORDERED + CALL WORK(I) + !$OMP END ORDERED + ENDIF + END DO + !$OMP END TASKLOOP + +END SUBROUTINE ORDERED_BAD -- GitLab From 976eba51d0dea36b9e0e4f6edb09883490f79684 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 18 Mar 2021 14:05:25 -0700 Subject: [PATCH 0188/1000] [libc++] NFCI: Remove dead code in the Lit configuration I was trying to fix something else and I stumbled upon several methods that are not used anymore in target_info.py. 
Differential Revision: https://reviews.llvm.org/D98896 --- libcxx/utils/libcxx/test/target_info.py | 58 ------------------------- 1 file changed, 58 deletions(-) diff --git a/libcxx/utils/libcxx/test/target_info.py b/libcxx/utils/libcxx/test/target_info.py index 45e3c4ae4d73..b128ab0f7726 100644 --- a/libcxx/utils/libcxx/test/target_info.py +++ b/libcxx/utils/libcxx/test/target_info.py @@ -21,18 +21,12 @@ class DefaultTargetInfo(object): self.full_config = full_config self.executor = None - def platform(self): - return sys.platform.lower().strip() - def is_windows(self): return False def is_mingw(self): return False - def is_darwin(self): - return False - def add_cxx_flags(self, flags): pass def add_cxx_compile_flags(self, flags): pass def add_cxx_link_flags(self, flags): pass @@ -53,33 +47,6 @@ class DarwinLocalTI(DefaultTargetInfo): def __init__(self, full_config): super(DarwinLocalTI, self).__init__(full_config) - def is_darwin(self): - return True - - def is_host_macosx(self): - name = lit.util.to_string(subprocess.check_output(['sw_vers', '-productName'])).strip() - return name == "Mac OS X" - - def get_macosx_version(self): - assert self.is_host_macosx() - version = lit.util.to_string(subprocess.check_output(['sw_vers', '-productVersion'])).strip() - version = re.sub(r'([0-9]+\.[0-9]+)(\..*)?', r'\1', version) - return version - - def get_sdk_version(self, name): - assert self.is_host_macosx() - cmd = ['xcrun', '--sdk', name, '--show-sdk-path'] - try: - out = subprocess.check_output(cmd).strip() - except OSError: - pass - - if not out: - self.full_config.lit_config.fatal( - "cannot infer sdk version with: %r" % cmd) - - return re.sub(r'.*/[^0-9]+([0-9.]+)\.sdk', r'\1', out) - def add_cxx_flags(self, flags): out, err, exit_code = executeCommand(['xcrun', '--show-sdk-path']) if exit_code != 0: @@ -120,31 +87,6 @@ class LinuxLocalTI(DefaultTargetInfo): def __init__(self, full_config): super(LinuxLocalTI, self).__init__(full_config) - def platform(self): - return 'linux' - - def _distribution(self): - try: - # linux_distribution is not available since Python 3.8 - # However, this function is only used to detect SLES 11, - # which is quite an old distribution that doesn't have - # Python 3.8. - return platform.linux_distribution() - except AttributeError: - return '', '', '' - - def platform_name(self): - name, _, _ = self._distribution() - # Some distros have spaces, e.g. 'SUSE Linux Enterprise Server' - # lit features can't have spaces - name = name.lower().strip().replace(' ', '-') - return name # Permitted to be None - - def platform_ver(self): - _, ver, _ = self._distribution() - ver = ver.lower().strip().replace(' ', '-') - return ver # Permitted to be None. - def add_cxx_compile_flags(self, flags): flags += ['-D__STDC_FORMAT_MACROS', '-D__STDC_LIMIT_MACROS', -- GitLab From 9406d43138811ac4dfd0ab31434f65a649bc882e Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Fri, 19 Mar 2021 11:56:46 -0700 Subject: [PATCH 0189/1000] Make the stop-on-sharedlibrary-events setting work. We weren't taking into account the "m_should_stop" setting that the synchronous breakpoint callback had already set when we did PerformAction in the StopInfoBreakpoint. So we didn't obey its instructions when it told us to stop. Fixed that and added some tests both for when we just have the setting, and when we have the setting AND other breakpoints at the shared library load notification breakpoint address. 
Differential Revision: https://reviews.llvm.org/D98914
---
 lldb/source/Breakpoint/BreakpointOptions.cpp  |  9 +-
 lldb/source/Target/StopInfo.cpp               | 29 +++++-
 .../stop-on-sharedlibrary-load/Makefile       | 16 ++++
 .../TestStopOnSharedlibraryEvents.py          | 96 +++++++++++++++++++
 .../stop-on-sharedlibrary-load/a.cpp          |  6 ++
 .../stop-on-sharedlibrary-load/b.cpp          |  6 ++
 .../stop-on-sharedlibrary-load/main.cpp       | 27 ++++++
 7 files changed, 184 insertions(+), 5 deletions(-)
 create mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
 create mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py
 create mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/a.cpp
 create mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/b.cpp
 create mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/main.cpp

diff --git a/lldb/source/Breakpoint/BreakpointOptions.cpp b/lldb/source/Breakpoint/BreakpointOptions.cpp
index 2fdb53e52723..24427835980e 100644
--- a/lldb/source/Breakpoint/BreakpointOptions.cpp
+++ b/lldb/source/Breakpoint/BreakpointOptions.cpp
@@ -453,9 +453,12 @@ bool BreakpointOptions::InvokeCallback(StoppointCallbackContext *context,
                        : nullptr,
                    context, break_id, break_loc_id);
     } else if (IsCallbackSynchronous()) {
-      // If a synchronous callback is called at async time, it should not say
-      // to stop.
-      return false;
+      // If a synchronous callback is called at async time, we will say we
+      // should stop, we're really expressing no opinion about stopping, and
+      // the StopInfoBreakpoint::PerformAction will note whether an async
+      // callback had already made a claim to stop or not based on the incoming
+      // values of m_should_stop & m_should_stop_is_valid.
+      return true;
     }
   }
   return true;
diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp
index 7e830c6e2bed..1cb582e83cc1 100644
--- a/lldb/source/Target/StopInfo.cpp
+++ b/lldb/source/Target/StopInfo.cpp
@@ -305,6 +305,20 @@ protected:
         // location said we should stop. But that's better than not running
        // all the callbacks.
 
+        // There's one other complication here. We may have run an async
+        // breakpoint callback that said we should stop. We only want to
+        // override that if another breakpoint action says we shouldn't
+        // stop. If nobody else has an opinion, then we should stop if the
+        // async callback says we should. An example of this is the async
+        // shared library load notification breakpoint and the setting
+        // stop-on-sharedlibrary-events.
+        // We'll keep the async value in async_should_stop, and track whether
+        // anyone said we should NOT stop in actually_said_continue.
+        bool async_should_stop = false;
+        if (m_should_stop_is_valid)
+          async_should_stop = m_should_stop;
+        bool actually_said_continue = false;
+
         m_should_stop = false;
 
         // We don't select threads as we go through them testing breakpoint
@@ -422,9 +436,10 @@ protected:
 
             bool precondition_result =
                 bp_loc_sp->GetBreakpoint().EvaluatePrecondition(context);
-            if (!precondition_result)
+            if (!precondition_result) {
+              actually_said_continue = true;
               continue;
-
+            }
             // Next run the condition for the breakpoint. If that says we
             // should stop, then we'll run the callback for the breakpoint. If
             // the callback says we shouldn't stop that will win.
@@ -462,6 +477,7 @@ protected:
                   // the condition fails. We've already bumped it by the time
                   // we get here, so undo the bump:
                   bp_loc_sp->UndoBumpHitCount();
+                  actually_said_continue = true;
                   continue;
                 }
               }
@@ -504,6 +520,9 @@ protected:
 
               if (callback_says_stop && auto_continue_says_stop)
                 m_should_stop = true;
+              else
+                actually_said_continue = true;
+
               // If we are going to stop for this breakpoint, then remove the
               // breakpoint.
@@ -517,9 +536,15 @@ protected:
                 // here.
                 if (HasTargetRunSinceMe()) {
                   m_should_stop = false;
+                  actually_said_continue = true;
                   break;
                 }
               }
+              // At this point if nobody actually told us to continue, we should
+              // give the async breakpoint callback a chance to weigh in:
+              if (!actually_said_continue && !m_should_stop) {
+                m_should_stop = async_should_stop;
+              }
             }
         // We've figured out what this stop wants to do, so mark it as valid so
         // we don't compute it again.
diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
new file mode 100644
index 000000000000..e87808bd222d
--- /dev/null
+++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile
@@ -0,0 +1,16 @@
+CXX_SOURCES := main.cpp
+USE_LIBDL := 1
+
+a.out: lib_a
+
+include Makefile.rules
+
+lib_a:
+	$(MAKE) -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=load_a
+
+lib_b:
+	$(MAKE) -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=load_b
+
+
diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py
new file mode 100644
index 000000000000..98c4eb89ff54
--- /dev/null
+++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py
@@ -0,0 +1,96 @@
+""" Test that stop-on-sharedlibrary-events works and cooperates with breakpoints. """
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+class TestStopOnSharedlibraryEvents(TestBase):
+
+    mydir = TestBase.compute_mydir(__file__)
+
+    @skipIfRemote
+    @skipIfWindows
+    @no_debug_info_test
+    def test_stopping_breakpoints(self):
+        self.do_test()
+
+    def test_auto_continue(self):
+        def auto_continue(bkpt):
+            bkpt.SetAutoContinue(True)
+        self.do_test(auto_continue)
+
+    def test_failing_condition(self):
+        def condition(bkpt):
+            bkpt.SetCondition("1 == 2")
+        self.do_test(condition)
+
+    def test_continue_callback(self):
+        def bkpt_callback(bkpt):
+            bkpt.SetScriptCallbackBody("return False")
+        self.do_test(bkpt_callback)
+
+    def do_test(self, bkpt_modifier = None):
+        self.build()
+        main_spec = lldb.SBFileSpec("main.cpp")
+        # Launch and stop before the dlopen call.
+        target, process, _, _ = lldbutil.run_to_source_breakpoint(self,
+            "// Set a breakpoint here", main_spec)
+
+        # Now turn on shared library events, continue and make sure we stop for the event.
We've already bumped it by the time // we get here, so undo the bump: bp_loc_sp->UndoBumpHitCount(); + actually_said_continue = true; continue; } } @@ -504,6 +520,9 @@ protected: if (callback_says_stop && auto_continue_says_stop) m_should_stop = true; + else + actually_said_continue = true; + // If we are going to stop for this breakpoint, then remove the // breakpoint. @@ -517,9 +536,15 @@ protected: // here. if (HasTargetRunSinceMe()) { m_should_stop = false; + actually_said_continue = true; break; } } + // At this point if nobody actually told us to continue, we should + // give the async breakpoint callback a chance to weigh in: + if (!actually_said_continue && !m_should_stop) { + m_should_stop = async_should_stop; + } } // We've figured out what this stop wants to do, so mark it as valid so // we don't compute it again. diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile new file mode 100644 index 000000000000..e87808bd222d --- /dev/null +++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile @@ -0,0 +1,16 @@ +CXX_SOURCES := main.cpp +USE_LIBDL := 1 + +a.out: lib_a + +include Makefile.rules + +lib_a: + $(MAKE) -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=load_a + +lib_b: + $(MAKE) -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=load_b + + diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py new file mode 100644 index 000000000000..98c4eb89ff54 --- /dev/null +++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py @@ -0,0 +1,96 @@ +""" Test that stop-on-sharedlibrary-events works and cooperates with breakpoints. """ +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +class TestStopOnSharedlibraryEvents(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + @skipIfRemote + @skipIfWindows + @no_debug_info_test + def test_stopping_breakpoints(self): + self.do_test() + + def test_auto_continue(self): + def auto_continue(bkpt): + bkpt.SetAutoContinue(True) + self.do_test(auto_continue) + + def test_failing_condition(self): + def condition(bkpt): + bkpt.SetCondition("1 == 2") + self.do_test(condition) + + def test_continue_callback(self): + def bkpt_callback(bkpt): + bkpt.SetScriptCallbackBody("return False") + self.do_test(bkpt_callback) + + def do_test(self, bkpt_modifier = None): + self.build() + main_spec = lldb.SBFileSpec("main.cpp") + # Launch and stop before the dlopen call. + target, process, _, _ = lldbutil.run_to_source_breakpoint(self, + "// Set a breakpoint here", main_spec) + + # Now turn on shared library events, continue and make sure we stop for the event. 
+ self.runCmd("settings set target.process.stop-on-sharedlibrary-events 1") + self.addTearDownHook(lambda: self.runCmd( + "settings set target.process.stop-on-sharedlibrary-events 0")) + + # Since I don't know how to check that we are at the "right place" to stop for + # shared library events, make an breakpoint after the load is done and + # make sure we don't stop there: + backstop_bkpt_1 = target.BreakpointCreateBySourceRegex("Set another here - we should not hit this one", main_spec) + self.assertGreater(backstop_bkpt_1.GetNumLocations(), 0, "Set our second breakpoint") + + process.Continue() + self.assertEqual(process.GetState(), lldb.eStateStopped, "We didn't stop for the load") + self.assertEqual(backstop_bkpt_1.GetHitCount(), 0, "Hit our backstop breakpoint") + + # We should be stopped after the library is loaded, check that: + found_it = False + for module in target.modules: + if module.file.basename.find("load_a") > -1: + found_it = True + break + self.assertTrue(found_it, "Found the loaded module.") + + # Now capture the place where we stopped so we can set a breakpoint and make + # sure the breakpoint there works correctly: + load_address = process.GetSelectedThread().frames[0].addr + load_bkpt = target.BreakpointCreateBySBAddress(load_address) + self.assertGreater(load_bkpt.GetNumLocations(), 0, "Set the load breakpoint") + + backstop_bkpt_1.SetEnabled(False) + + backstop_bkpt_2 = target.BreakpointCreateBySourceRegex("Set a third here - we should not hit this one", main_spec) + self.assertGreater(backstop_bkpt_2.GetNumLocations(), 0, "Set our third breakpoint") + + if bkpt_modifier == None: + process.Continue() + self.assertEqual(process.GetState(), lldb.eStateStopped, "We didn't stop for the load") + self.assertEqual(backstop_bkpt_2.GetHitCount(), 0, "Hit our backstop breakpoint") + + thread = process.GetSelectedThread() + self.assertEqual(thread.stop_reason, lldb.eStopReasonBreakpoint, "We attributed the stop to the breakpoint") + self.assertEqual(thread.GetStopReasonDataCount(), 2, "Only hit one breakpoint") + bkpt_no = thread.GetStopReasonDataAtIndex(0) + self.assertEqual(bkpt_no, load_bkpt.id, "We hit our breakpoint at the load address") + else: + bkpt_modifier(load_bkpt) + process.Continue() + self.assertEqual(process.GetState(), lldb.eStateStopped, "We didn't stop") + thread = process.GetSelectedThread() + self.assertEqual(thread.stop_reason, lldb.eStopReasonBreakpoint, "We didn't hit some breakpoint") + self.assertEqual(thread.GetStopReasonDataCount(), 2, "Only hit one breakpoint") + bkpt_no = thread.GetStopReasonDataAtIndex(0) + self.assertEqual(bkpt_no, backstop_bkpt_2.id, "We continued to the right breakpoint") + + + + + diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/a.cpp b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/a.cpp new file mode 100644 index 000000000000..b7b702c5d62d --- /dev/null +++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/a.cpp @@ -0,0 +1,6 @@ +extern int a_has_a_function(); + +int +a_has_a_function() { + return 10; +} diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/b.cpp b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/b.cpp new file mode 100644 index 000000000000..5a347e60db3a --- /dev/null +++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/b.cpp @@ -0,0 +1,6 @@ +extern int b_has_a_function(); + +int +b_has_a_function() { + return 100; +} diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/main.cpp 
b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/main.cpp new file mode 100644 index 000000000000..96b1e1df445b --- /dev/null +++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/main.cpp @@ -0,0 +1,27 @@ +#include "dylib.h" +#include +#include +#include +#include + +int main(int argc, char const *argv[]) { + const char *a_name = "load_a"; + void *a_dylib_handle = NULL; + + a_dylib_handle = dylib_open(a_name); // Set a breakpoint here. + if (a_dylib_handle == NULL) { // Set another here - we should not hit this one + fprintf(stderr, "%s\n", dylib_last_error()); + exit(1); + } + + const char *b_name = "load_b"; + void *b_dylib_handle = NULL; + + b_dylib_handle = dylib_open(b_name); + if (b_dylib_handle == NULL) { // Set a third here - we should not hit this one + fprintf(stderr, "%s\n", dylib_last_error()); + exit(1); + } + + return 0; +} -- GitLab From a8d62fc8ff1c836e16cfb1a510ee8063ac2652ff Mon Sep 17 00:00:00 2001 From: Jim Ingham Date: Fri, 19 Mar 2021 12:05:16 -0700 Subject: [PATCH 0190/1000] Skip all the tests for Windows. --- .../TestStopOnSharedlibraryEvents.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py index 98c4eb89ff54..d19a790f7830 100644 --- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py +++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py @@ -14,16 +14,25 @@ class TestStopOnSharedlibraryEvents(TestBase): def test_stopping_breakpoints(self): self.do_test() + @skipIfRemote + @skipIfWindows + @no_debug_info_test def test_auto_continue(self): def auto_continue(bkpt): bkpt.SetAutoContinue(True) self.do_test(auto_continue) + @skipIfRemote + @skipIfWindows + @no_debug_info_test def test_failing_condition(self): def condition(bkpt): bkpt.SetCondition("1 == 2") self.do_test(condition) + @skipIfRemote + @skipIfWindows + @no_debug_info_test def test_continue_callback(self): def bkpt_callback(bkpt): bkpt.SetScriptCallbackBody("return False") -- GitLab From 62f9c3358b81d9e9691cc90da2f9b1cf93682a79 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 19 Mar 2021 15:05:52 -0400 Subject: [PATCH 0191/1000] [SLP] add tests for min/max reductions that use intrinsics; NFC --- .../SLPVectorizer/X86/horizontal-minmax.ll | 245 +++++++++++++++++- 1 file changed, 239 insertions(+), 6 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index 40962860b731..c4184eefbd4e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -9,6 +9,11 @@ @arrp = local_unnamed_addr global [32 x i32*] zeroinitializer, align 16 @var = global i32 zeroinitializer, align 8 +declare i32 @llvm.smax.i32(i32, i32) +declare i16 @llvm.smin.i16(i16, i16) +declare i64 @llvm.umax.i64(i64 %mh, i64) +declare i8 @llvm.umin.i8(i8, i8) + define i32 @maxi8(i32) { ; CHECK-LABEL: @maxi8( ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr to <8 x i32>*), align 16 @@ -198,8 +203,8 @@ define i32 @maxi32(i32) { ret i32 %95 } -; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select -; with fastmath on the select. 
+; Note: legacy test - InstCombine creates maxnum intrinsics for fcmp+select with fastmath on the select. + define float @maxf8(float) { ; DEFAULT-LABEL: @maxf8( ; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 @@ -277,8 +282,8 @@ define float @maxf8(float) { ret float %23 } -; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select -; with fastmath on the select. +; Note: legacy test - maxnum intrinsics match what InstCombine creates for fcmp+select with fastmath on the select. + define float @maxf16(float) { ; DEFAULT-LABEL: @maxf16( ; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 @@ -428,8 +433,8 @@ define float @maxf16(float) { ret float %47 } -; FIXME: Use fmaxnum intrinsics to match what InstCombine creates for fcmp+select -; with fastmath on the select. +; Note: legacy test - InstCombine creates maxnum intrinsics for fcmp+select with fastmath on the select. + define float @maxf32(float) { ; DEFAULT-LABEL: @maxf32( ; DEFAULT-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([32 x float], [32 x float]* @arr1, i64 0, i64 0), align 16 @@ -1001,3 +1006,231 @@ define i32* @maxp8(i32) { %23 = select i1 %22, i32* %20, i32* %21 ret i32* %23 } + +define i32 @smax_intrinsic_rdx_v8i32(i32* %p0) { +; CHECK-LABEL: @smax_intrinsic_rdx_v8i32( +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 4 +; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 5 +; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 6 +; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 7 +; CHECK-NEXT: [[T0:%.*]] = load i32, i32* [[P0]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load i32, i32* [[P1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[P2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load i32, i32* [[P3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load i32, i32* [[P4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load i32, i32* [[P5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[P6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load i32, i32* [[P7]], align 4 +; CHECK-NEXT: [[M10:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T1]], i32 [[T0]]) +; CHECK-NEXT: [[M32:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T3]], i32 [[T2]]) +; CHECK-NEXT: [[M54:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T5]], i32 [[T4]]) +; CHECK-NEXT: [[M76:%.*]] = tail call i32 @llvm.smax.i32(i32 [[T7]], i32 [[T6]]) +; CHECK-NEXT: [[M3210:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M32]], i32 [[M10]]) +; CHECK-NEXT: [[M7654:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M76]], i32 [[M54]]) +; CHECK-NEXT: [[M:%.*]] = tail call i32 @llvm.smax.i32(i32 [[M7654]], i32 [[M3210]]) +; CHECK-NEXT: ret i32 [[M]] +; + %p1 = getelementptr inbounds i32, i32* %p0, i64 1 + %p2 = getelementptr inbounds i32, i32* %p0, i64 2 + %p3 = getelementptr inbounds i32, i32* %p0, i64 3 + %p4 = getelementptr inbounds i32, i32* %p0, i64 4 + %p5 = getelementptr inbounds i32, i32* %p0, i64 5 + %p6 = getelementptr inbounds i32, i32* %p0, i64 6 + %p7 = getelementptr inbounds i32, i32* %p0, i64 7 + %t0 = load i32, i32* %p0, align 4 + %t1 = load i32, i32* %p1, align 4 + %t2 = load i32, i32* %p2, align 4 + 
%t3 = load i32, i32* %p3, align 4 + %t4 = load i32, i32* %p4, align 4 + %t5 = load i32, i32* %p5, align 4 + %t6 = load i32, i32* %p6, align 4 + %t7 = load i32, i32* %p7, align 4 + %m10 = tail call i32 @llvm.smax.i32(i32 %t1, i32 %t0) + %m32 = tail call i32 @llvm.smax.i32(i32 %t3, i32 %t2) + %m54 = tail call i32 @llvm.smax.i32(i32 %t5, i32 %t4) + %m76 = tail call i32 @llvm.smax.i32(i32 %t7, i32 %t6) + %m3210 = tail call i32 @llvm.smax.i32(i32 %m32, i32 %m10) + %m7654 = tail call i32 @llvm.smax.i32(i32 %m76, i32 %m54) + %m = tail call i32 @llvm.smax.i32(i32 %m7654, i32 %m3210) + ret i32 %m +} + +define i16 @smin_intrinsic_rdx_v8i16(i16* %p0) { +; CHECK-LABEL: @smin_intrinsic_rdx_v8i16( +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 4 +; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 5 +; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 6 +; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 7 +; CHECK-NEXT: [[T0:%.*]] = load i16, i16* [[P0]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load i16, i16* [[P1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load i16, i16* [[P2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load i16, i16* [[P3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load i16, i16* [[P4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load i16, i16* [[P5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load i16, i16* [[P6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load i16, i16* [[P7]], align 4 +; CHECK-NEXT: [[M10:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T1]], i16 [[T0]]) +; CHECK-NEXT: [[M32:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T3]], i16 [[T2]]) +; CHECK-NEXT: [[M54:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T5]], i16 [[T4]]) +; CHECK-NEXT: [[M76:%.*]] = tail call i16 @llvm.smin.i16(i16 [[T7]], i16 [[T6]]) +; CHECK-NEXT: [[M3210:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M32]], i16 [[M10]]) +; CHECK-NEXT: [[M7654:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M76]], i16 [[M54]]) +; CHECK-NEXT: [[M:%.*]] = tail call i16 @llvm.smin.i16(i16 [[M7654]], i16 [[M3210]]) +; CHECK-NEXT: ret i16 [[M]] +; + %p1 = getelementptr inbounds i16, i16* %p0, i64 1 + %p2 = getelementptr inbounds i16, i16* %p0, i64 2 + %p3 = getelementptr inbounds i16, i16* %p0, i64 3 + %p4 = getelementptr inbounds i16, i16* %p0, i64 4 + %p5 = getelementptr inbounds i16, i16* %p0, i64 5 + %p6 = getelementptr inbounds i16, i16* %p0, i64 6 + %p7 = getelementptr inbounds i16, i16* %p0, i64 7 + %t0 = load i16, i16* %p0, align 4 + %t1 = load i16, i16* %p1, align 4 + %t2 = load i16, i16* %p2, align 4 + %t3 = load i16, i16* %p3, align 4 + %t4 = load i16, i16* %p4, align 4 + %t5 = load i16, i16* %p5, align 4 + %t6 = load i16, i16* %p6, align 4 + %t7 = load i16, i16* %p7, align 4 + %m10 = tail call i16 @llvm.smin.i16(i16 %t1, i16 %t0) + %m32 = tail call i16 @llvm.smin.i16(i16 %t3, i16 %t2) + %m54 = tail call i16 @llvm.smin.i16(i16 %t5, i16 %t4) + %m76 = tail call i16 @llvm.smin.i16(i16 %t7, i16 %t6) + %m3210 = tail call i16 @llvm.smin.i16(i16 %m32, i16 %m10) + %m7654 = tail call i16 @llvm.smin.i16(i16 %m76, i16 %m54) + %m = tail call i16 @llvm.smin.i16(i16 %m7654, i16 %m3210) + ret i16 %m +} + +define i64 @umax_intrinsic_rdx_v4i64(i64* %p0) { +; CHECK-LABEL: @umax_intrinsic_rdx_v4i64( +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i64, i64* [[P0:%.*]], 
i64 1 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 2 +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i64, i64* [[P0]], i64 3 +; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[P0]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[P1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load i64, i64* [[P2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load i64, i64* [[P3]], align 4 +; CHECK-NEXT: [[M10:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T1]], i64 [[T0]]) +; CHECK-NEXT: [[M32:%.*]] = tail call i64 @llvm.umax.i64(i64 [[T3]], i64 [[T2]]) +; CHECK-NEXT: [[M:%.*]] = tail call i64 @llvm.umax.i64(i64 [[M32]], i64 [[M10]]) +; CHECK-NEXT: ret i64 [[M]] +; + %p1 = getelementptr inbounds i64, i64* %p0, i64 1 + %p2 = getelementptr inbounds i64, i64* %p0, i64 2 + %p3 = getelementptr inbounds i64, i64* %p0, i64 3 + %t0 = load i64, i64* %p0, align 4 + %t1 = load i64, i64* %p1, align 4 + %t2 = load i64, i64* %p2, align 4 + %t3 = load i64, i64* %p3, align 4 + %m10 = tail call i64 @llvm.umax.i64(i64 %t1, i64 %t0) + %m32 = tail call i64 @llvm.umax.i64(i64 %t3, i64 %t2) + %m = tail call i64 @llvm.umax.i64(i64 %m32, i64 %m10) + ret i64 %m +} + +define i8 @umin_intrinsic_rdx_v16i8(i8* %p0) { +; CHECK-LABEL: @umin_intrinsic_rdx_v16i8( +; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 4 +; CHECK-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 +; CHECK-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 +; CHECK-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 +; CHECK-NEXT: [[P8:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 8 +; CHECK-NEXT: [[P9:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 9 +; CHECK-NEXT: [[PA:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 10 +; CHECK-NEXT: [[PB:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 11 +; CHECK-NEXT: [[PC:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 12 +; CHECK-NEXT: [[PD:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 +; CHECK-NEXT: [[PE:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 +; CHECK-NEXT: [[PF:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 +; CHECK-NEXT: [[T0:%.*]] = load i8, i8* [[P0]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load i8, i8* [[P1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load i8, i8* [[P2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load i8, i8* [[P3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load i8, i8* [[P4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load i8, i8* [[P5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load i8, i8* [[P6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load i8, i8* [[P7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load i8, i8* [[P8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load i8, i8* [[P9]], align 4 +; CHECK-NEXT: [[TA:%.*]] = load i8, i8* [[PA]], align 4 +; CHECK-NEXT: [[TB:%.*]] = load i8, i8* [[PB]], align 4 +; CHECK-NEXT: [[TC:%.*]] = load i8, i8* [[PC]], align 4 +; CHECK-NEXT: [[TD:%.*]] = load i8, i8* [[PD]], align 4 +; CHECK-NEXT: [[TE:%.*]] = load i8, i8* [[PE]], align 4 +; CHECK-NEXT: [[TF:%.*]] = load i8, i8* [[PF]], align 4 +; CHECK-NEXT: [[M10:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T1]], i8 [[T0]]) +; CHECK-NEXT: [[M32:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T3]], i8 [[T2]]) +; CHECK-NEXT: [[M54:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T5]], i8 [[T4]]) +; CHECK-NEXT: [[M76:%.*]] = tail call i8 
@llvm.umin.i8(i8 [[T7]], i8 [[T6]]) +; CHECK-NEXT: [[M98:%.*]] = tail call i8 @llvm.umin.i8(i8 [[T9]], i8 [[T8]]) +; CHECK-NEXT: [[MBA:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TB]], i8 [[TA]]) +; CHECK-NEXT: [[MDC:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TD]], i8 [[TC]]) +; CHECK-NEXT: [[MFE:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TF]], i8 [[TE]]) +; CHECK-NEXT: [[M3210:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M32]], i8 [[M10]]) +; CHECK-NEXT: [[M7654:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M76]], i8 [[M54]]) +; CHECK-NEXT: [[MDC98:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MDC]], i8 [[M98]]) +; CHECK-NEXT: [[MFEBA:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MFE]], i8 [[MBA]]) +; CHECK-NEXT: [[ML:%.*]] = tail call i8 @llvm.umin.i8(i8 [[M3210]], i8 [[M7654]]) +; CHECK-NEXT: [[MH:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MFEBA]], i8 [[MDC98]]) +; CHECK-NEXT: [[M:%.*]] = tail call i8 @llvm.umin.i8(i8 [[MH]], i8 [[ML]]) +; CHECK-NEXT: ret i8 [[M]] +; + %p1 = getelementptr inbounds i8, i8* %p0, i64 1 + %p2 = getelementptr inbounds i8, i8* %p0, i64 2 + %p3 = getelementptr inbounds i8, i8* %p0, i64 3 + %p4 = getelementptr inbounds i8, i8* %p0, i64 4 + %p5 = getelementptr inbounds i8, i8* %p0, i64 5 + %p6 = getelementptr inbounds i8, i8* %p0, i64 6 + %p7 = getelementptr inbounds i8, i8* %p0, i64 7 + %p8 = getelementptr inbounds i8, i8* %p0, i64 8 + %p9 = getelementptr inbounds i8, i8* %p0, i64 9 + %pa = getelementptr inbounds i8, i8* %p0, i64 10 + %pb = getelementptr inbounds i8, i8* %p0, i64 11 + %pc = getelementptr inbounds i8, i8* %p0, i64 12 + %pd = getelementptr inbounds i8, i8* %p0, i64 13 + %pe = getelementptr inbounds i8, i8* %p0, i64 14 + %pf = getelementptr inbounds i8, i8* %p0, i64 15 + %t0 = load i8, i8* %p0, align 4 + %t1 = load i8, i8* %p1, align 4 + %t2 = load i8, i8* %p2, align 4 + %t3 = load i8, i8* %p3, align 4 + %t4 = load i8, i8* %p4, align 4 + %t5 = load i8, i8* %p5, align 4 + %t6 = load i8, i8* %p6, align 4 + %t7 = load i8, i8* %p7, align 4 + %t8 = load i8, i8* %p8, align 4 + %t9 = load i8, i8* %p9, align 4 + %ta = load i8, i8* %pa, align 4 + %tb = load i8, i8* %pb, align 4 + %tc = load i8, i8* %pc, align 4 + %td = load i8, i8* %pd, align 4 + %te = load i8, i8* %pe, align 4 + %tf = load i8, i8* %pf, align 4 + %m10 = tail call i8 @llvm.umin.i8(i8 %t1, i8 %t0) + %m32 = tail call i8 @llvm.umin.i8(i8 %t3, i8 %t2) + %m54 = tail call i8 @llvm.umin.i8(i8 %t5, i8 %t4) + %m76 = tail call i8 @llvm.umin.i8(i8 %t7, i8 %t6) + %m98 = tail call i8 @llvm.umin.i8(i8 %t9, i8 %t8) + %mba = tail call i8 @llvm.umin.i8(i8 %tb, i8 %ta) + %mdc = tail call i8 @llvm.umin.i8(i8 %td, i8 %tc) + %mfe = tail call i8 @llvm.umin.i8(i8 %tf, i8 %te) + %m3210 = tail call i8 @llvm.umin.i8(i8 %m32, i8 %m10) + %m7654 = tail call i8 @llvm.umin.i8(i8 %m76, i8 %m54) + %mdc98 = tail call i8 @llvm.umin.i8(i8 %mdc, i8 %m98) + %mfeba = tail call i8 @llvm.umin.i8(i8 %mfe, i8 %mba) + %ml = tail call i8 @llvm.umin.i8(i8 %m3210, i8 %m7654) + %mh = tail call i8 @llvm.umin.i8(i8 %mfeba, i8 %mdc98) + %m = tail call i8 @llvm.umin.i8(i8 %mh, i8 %ml) + ret i8 %m +} -- GitLab From 2fc47afed2182dee206523548e40ed4bd31877c9 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Fri, 19 Mar 2021 15:09:53 -0400 Subject: [PATCH 0192/1000] [SLP] remove unnecessary characters in test; NFC Glitch that crept in with 62f9c3358b81 --- llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
index c4184eefbd4e..433d79db490c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll
@@ -11,7 +11,7 @@
 
 declare i32 @llvm.smax.i32(i32, i32)
 declare i16 @llvm.smin.i16(i16, i16)
-declare i64 @llvm.umax.i64(i64 %mh, i64)
+declare i64 @llvm.umax.i64(i64, i64)
 declare i8 @llvm.umin.i8(i8, i8)
 
 define i32 @maxi8(i32) {
-- 
GitLab


From e8e07b3a5e6032edeed559db448402094cff31bf Mon Sep 17 00:00:00 2001
From: Jim Ingham
Date: Fri, 19 Mar 2021 12:38:23 -0700
Subject: [PATCH 0193/1000] Revert "Skip all the tests for Windows."

This reverts commit a8d62fc8ff1c836e16cfb1a510ee8063ac2652ff.

---
 .../TestStopOnSharedlibraryEvents.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py
index d19a790f7830..98c4eb89ff54 100644
--- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py
+++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py
@@ -14,25 +14,16 @@ class TestStopOnSharedlibraryEvents(TestBase):
     def test_stopping_breakpoints(self):
         self.do_test()
 
-    @skipIfRemote
-    @skipIfWindows
-    @no_debug_info_test
     def test_auto_continue(self):
         def auto_continue(bkpt):
             bkpt.SetAutoContinue(True)
         self.do_test(auto_continue)
 
-    @skipIfRemote
-    @skipIfWindows
-    @no_debug_info_test
     def test_failing_condition(self):
         def condition(bkpt):
             bkpt.SetCondition("1 == 2")
         self.do_test(condition)
 
-    @skipIfRemote
-    @skipIfWindows
-    @no_debug_info_test
     def test_continue_callback(self):
         def bkpt_callback(bkpt):
             bkpt.SetScriptCallbackBody("return False")
-- 
GitLab


From 9d081a7ffe5c2f9575f77bedd6cbf4385287aeec Mon Sep 17 00:00:00 2001
From: Jim Ingham
Date: Fri, 19 Mar 2021 12:38:41 -0700
Subject: [PATCH 0194/1000] Revert "Make the stop-on-sharedlibrary-events setting work."

This reverts commit 9406d43138811ac4dfd0ab31434f65a649bc882e.

I messed up a test, and when I got it right it was failing. The changed
logic doesn't work quite right (now the async callback called at sync time
is forcing us to stop). I need to be a little more careful about that.
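To make the failure mode concrete, a sketch of the interaction the reverted tests exercise
(condensed from the test code below; `load_bkpt` is the breakpoint the test sets at the
shared-library load address):

    # Sketch only: the user's scripted callback votes "don't stop"...
    load_bkpt.SetScriptCallbackBody("return False")
    process.Continue()
    # ...but with the reverted change, the synchronous shared-library
    # callback answered "stop" unconditionally, overriding that vote.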
--- lldb/source/Breakpoint/BreakpointOptions.cpp | 9 +- lldb/source/Target/StopInfo.cpp | 29 +----- .../stop-on-sharedlibrary-load/Makefile | 16 ---- .../TestStopOnSharedlibraryEvents.py | 96 ------------------- .../stop-on-sharedlibrary-load/a.cpp | 6 -- .../stop-on-sharedlibrary-load/b.cpp | 6 -- .../stop-on-sharedlibrary-load/main.cpp | 27 ------ 7 files changed, 5 insertions(+), 184 deletions(-) delete mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile delete mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py delete mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/a.cpp delete mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/b.cpp delete mode 100644 lldb/test/API/functionalities/stop-on-sharedlibrary-load/main.cpp diff --git a/lldb/source/Breakpoint/BreakpointOptions.cpp b/lldb/source/Breakpoint/BreakpointOptions.cpp index 24427835980e..2fdb53e52723 100644 --- a/lldb/source/Breakpoint/BreakpointOptions.cpp +++ b/lldb/source/Breakpoint/BreakpointOptions.cpp @@ -453,12 +453,9 @@ bool BreakpointOptions::InvokeCallback(StoppointCallbackContext *context, : nullptr, context, break_id, break_loc_id); } else if (IsCallbackSynchronous()) { - // If a synchronous callback is called at async time, we will say we - // should stop, we're really expression no opinion about stopping, and - // the StopInfoBreakpoint::PerformAction will note whether an async - // callback had already made a claim to stop or not based on the incoming - // values of m_should_stop & m_should_stop_is_valid. - return true; + // If a synchronous callback is called at async time, it should not say + // to stop. + return false; } } return true; diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp index 1cb582e83cc1..7e830c6e2bed 100644 --- a/lldb/source/Target/StopInfo.cpp +++ b/lldb/source/Target/StopInfo.cpp @@ -305,20 +305,6 @@ protected: // location said we should stop. But that's better than not running // all the callbacks. - // There's one other complication here. We may have run an async - // breakpoint callback that said we should stop. We only want to - // override that if another breakpoint action says we shouldn't - // stop. If nobody else has an opinion, then we should stop if the - // async callback says we should. An example of this is the async - // shared library load notification breakpoint and the setting - // stop-on-sharedlibrary-events. - // We'll keep the async value in async_should_stop, and track whether - // anyone said we should NOT stop in actually_said_continue. - bool async_should_stop = false; - if (m_should_stop_is_valid) - async_should_stop = m_should_stop; - bool actually_said_continue = false; - m_should_stop = false; // We don't select threads as we go through them testing breakpoint @@ -436,10 +422,9 @@ protected: bool precondition_result = bp_loc_sp->GetBreakpoint().EvaluatePrecondition(context); - if (!precondition_result) { - actually_said_continue = true; + if (!precondition_result) continue; - } + // Next run the condition for the breakpoint. If that says we // should stop, then we'll run the callback for the breakpoint. If // the callback says we shouldn't stop that will win. @@ -477,7 +462,6 @@ protected: // the condition fails. 
We've already bumped it by the time // we get here, so undo the bump: bp_loc_sp->UndoBumpHitCount(); - actually_said_continue = true; continue; } } @@ -520,9 +504,6 @@ protected: if (callback_says_stop && auto_continue_says_stop) m_should_stop = true; - else - actually_said_continue = true; - // If we are going to stop for this breakpoint, then remove the // breakpoint. @@ -536,15 +517,9 @@ protected: // here. if (HasTargetRunSinceMe()) { m_should_stop = false; - actually_said_continue = true; break; } } - // At this point if nobody actually told us to continue, we should - // give the async breakpoint callback a chance to weigh in: - if (!actually_said_continue && !m_should_stop) { - m_should_stop = async_should_stop; - } } // We've figured out what this stop wants to do, so mark it as valid so // we don't compute it again. diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile deleted file mode 100644 index e87808bd222d..000000000000 --- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -CXX_SOURCES := main.cpp -USE_LIBDL := 1 - -a.out: lib_a - -include Makefile.rules - -lib_a: - $(MAKE) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=load_a - -lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=load_b - - diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py deleted file mode 100644 index 98c4eb89ff54..000000000000 --- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/TestStopOnSharedlibraryEvents.py +++ /dev/null @@ -1,96 +0,0 @@ -""" Test that stop-on-sharedlibrary-events works and cooperates with breakpoints. """ -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - -class TestStopOnSharedlibraryEvents(TestBase): - - mydir = TestBase.compute_mydir(__file__) - - @skipIfRemote - @skipIfWindows - @no_debug_info_test - def test_stopping_breakpoints(self): - self.do_test() - - def test_auto_continue(self): - def auto_continue(bkpt): - bkpt.SetAutoContinue(True) - self.do_test(auto_continue) - - def test_failing_condition(self): - def condition(bkpt): - bkpt.SetCondition("1 == 2") - self.do_test(condition) - - def test_continue_callback(self): - def bkpt_callback(bkpt): - bkpt.SetScriptCallbackBody("return False") - self.do_test(bkpt_callback) - - def do_test(self, bkpt_modifier = None): - self.build() - main_spec = lldb.SBFileSpec("main.cpp") - # Launch and stop before the dlopen call. - target, process, _, _ = lldbutil.run_to_source_breakpoint(self, - "// Set a breakpoint here", main_spec) - - # Now turn on shared library events, continue and make sure we stop for the event. 
- self.runCmd("settings set target.process.stop-on-sharedlibrary-events 1") - self.addTearDownHook(lambda: self.runCmd( - "settings set target.process.stop-on-sharedlibrary-events 0")) - - # Since I don't know how to check that we are at the "right place" to stop for - # shared library events, make an breakpoint after the load is done and - # make sure we don't stop there: - backstop_bkpt_1 = target.BreakpointCreateBySourceRegex("Set another here - we should not hit this one", main_spec) - self.assertGreater(backstop_bkpt_1.GetNumLocations(), 0, "Set our second breakpoint") - - process.Continue() - self.assertEqual(process.GetState(), lldb.eStateStopped, "We didn't stop for the load") - self.assertEqual(backstop_bkpt_1.GetHitCount(), 0, "Hit our backstop breakpoint") - - # We should be stopped after the library is loaded, check that: - found_it = False - for module in target.modules: - if module.file.basename.find("load_a") > -1: - found_it = True - break - self.assertTrue(found_it, "Found the loaded module.") - - # Now capture the place where we stopped so we can set a breakpoint and make - # sure the breakpoint there works correctly: - load_address = process.GetSelectedThread().frames[0].addr - load_bkpt = target.BreakpointCreateBySBAddress(load_address) - self.assertGreater(load_bkpt.GetNumLocations(), 0, "Set the load breakpoint") - - backstop_bkpt_1.SetEnabled(False) - - backstop_bkpt_2 = target.BreakpointCreateBySourceRegex("Set a third here - we should not hit this one", main_spec) - self.assertGreater(backstop_bkpt_2.GetNumLocations(), 0, "Set our third breakpoint") - - if bkpt_modifier == None: - process.Continue() - self.assertEqual(process.GetState(), lldb.eStateStopped, "We didn't stop for the load") - self.assertEqual(backstop_bkpt_2.GetHitCount(), 0, "Hit our backstop breakpoint") - - thread = process.GetSelectedThread() - self.assertEqual(thread.stop_reason, lldb.eStopReasonBreakpoint, "We attributed the stop to the breakpoint") - self.assertEqual(thread.GetStopReasonDataCount(), 2, "Only hit one breakpoint") - bkpt_no = thread.GetStopReasonDataAtIndex(0) - self.assertEqual(bkpt_no, load_bkpt.id, "We hit our breakpoint at the load address") - else: - bkpt_modifier(load_bkpt) - process.Continue() - self.assertEqual(process.GetState(), lldb.eStateStopped, "We didn't stop") - thread = process.GetSelectedThread() - self.assertEqual(thread.stop_reason, lldb.eStopReasonBreakpoint, "We didn't hit some breakpoint") - self.assertEqual(thread.GetStopReasonDataCount(), 2, "Only hit one breakpoint") - bkpt_no = thread.GetStopReasonDataAtIndex(0) - self.assertEqual(bkpt_no, backstop_bkpt_2.id, "We continued to the right breakpoint") - - - - - diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/a.cpp b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/a.cpp deleted file mode 100644 index b7b702c5d62d..000000000000 --- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/a.cpp +++ /dev/null @@ -1,6 +0,0 @@ -extern int a_has_a_function(); - -int -a_has_a_function() { - return 10; -} diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/b.cpp b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/b.cpp deleted file mode 100644 index 5a347e60db3a..000000000000 --- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/b.cpp +++ /dev/null @@ -1,6 +0,0 @@ -extern int b_has_a_function(); - -int -b_has_a_function() { - return 100; -} diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/main.cpp 
b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/main.cpp deleted file mode 100644 index 96b1e1df445b..000000000000 --- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/main.cpp +++ /dev/null @@ -1,27 +0,0 @@ -#include "dylib.h" -#include -#include -#include -#include - -int main(int argc, char const *argv[]) { - const char *a_name = "load_a"; - void *a_dylib_handle = NULL; - - a_dylib_handle = dylib_open(a_name); // Set a breakpoint here. - if (a_dylib_handle == NULL) { // Set another here - we should not hit this one - fprintf(stderr, "%s\n", dylib_last_error()); - exit(1); - } - - const char *b_name = "load_b"; - void *b_dylib_handle = NULL; - - b_dylib_handle = dylib_open(b_name); - if (b_dylib_handle == NULL) { // Set a third here - we should not hit this one - fprintf(stderr, "%s\n", dylib_last_error()); - exit(1); - } - - return 0; -} -- GitLab From 19d2c65ddd757997785163709800f837857f686d Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 19 Mar 2021 20:35:17 +0100 Subject: [PATCH 0195/1000] [CodeGen] Don't crash on for loops with cond variables and no increment This looks like an oversight from a875721d8a2d, creating IR that refers to `for.inc` even if it doesn't exist. Differential Revision: https://reviews.llvm.org/D98980 --- clang/lib/CodeGen/CGStmt.cpp | 2 +- clang/test/CodeGenCXX/for-cond-var.cpp | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 6461e2011216..38f3aa941415 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -992,7 +992,7 @@ void CodeGenFunction::EmitForStmt(const ForStmt &S, // We have entered the condition variable's scope, so we're now able to // jump to the continue block. - Continue = getJumpDestInCurrentScope("for.inc"); + Continue = S.getInc() ? 
getJumpDestInCurrentScope("for.inc") : CondDest; BreakContinueStack.back().ContinueBlock = Continue; } diff --git a/clang/test/CodeGenCXX/for-cond-var.cpp b/clang/test/CodeGenCXX/for-cond-var.cpp index 45b4a82cb905..60e54d4141f7 100644 --- a/clang/test/CodeGenCXX/for-cond-var.cpp +++ b/clang/test/CodeGenCXX/for-cond-var.cpp @@ -123,3 +123,16 @@ void PR49585_break() { // CHECK [[for_end]]: // CHECK: ret void } + +// CHECK: define {{.*}} void @_Z16incless_for_loopv( +void incless_for_loop() { + // CHECK: br label %[[for_cond:.*]] + // CHECK: [[for_cond]]: + // CHECK: br i1 {{.*}}, label %[[for_body:.*]], label %[[for_end:.*]] + // CHECK: [[for_body]]: + // CHECK: br label %[[for_cond]] + // CHECK: [[for_end]]: + // CHECK: ret void + // CHECK: } + for (; int b = 0;) continue; +} -- GitLab From 6327a7cfd734ffe999c631854d8ca07510f9036a Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 19 Mar 2021 20:56:59 +0100 Subject: [PATCH 0196/1000] [mlir][Linalg] Make LLVM_DEBUG region bigger to avoid warnings in Release builds Transforms.cpp:586:16: error: unused variable 'v' [-Werror,-Wunused-variable] for (Value v : operands) ^ --- mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index fef6dd8f996f..965275dc2bcc 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -581,10 +581,12 @@ static AffineMap substitute( auto map = AffineMap::get(dims.size(), symbols.size(), exprs, exprs.front().getContext()); - LLVM_DEBUG(DBGS() << "Map to simplify: " << map << "\n"); - LLVM_DEBUG(DBGS() << "Operands:\n"); - for (Value v : operands) - LLVM_DEBUG(DBGS() << v << "\n"); + LLVM_DEBUG({ + DBGS() << "Map to simplify: " << map << "\n"; + DBGS() << "Operands:\n"; + for (Value v : operands) + DBGS() << v << "\n"; + }); // Pull in affine.apply operations and compose them fully into the // result. -- GitLab From a531bbd9adfc09b2e62ef0097580f1fe1603ca23 Mon Sep 17 00:00:00 2001 From: Butygin Date: Fri, 12 Mar 2021 17:39:43 +0300 Subject: [PATCH 0197/1000] [MLIR] Test pattern benefit sorting between operation specific and operation agnostic patterns. Previously low benefit op-specific patterns never had a chance to match even if high benefit op-agnostic pattern failed to match. 
This was already fixed upstream; this commit just adds a test case.

Differential Revision: https://reviews.llvm.org/D98513
---
 mlir/unittests/CMakeLists.txt             |  1 +
 mlir/unittests/Rewrite/CMakeLists.txt     |  7 ++
 mlir/unittests/Rewrite/PatternBenefit.cpp | 78 +++++++++++++++++++++++
 3 files changed, 86 insertions(+)
 create mode 100644 mlir/unittests/Rewrite/CMakeLists.txt
 create mode 100644 mlir/unittests/Rewrite/PatternBenefit.cpp

diff --git a/mlir/unittests/CMakeLists.txt b/mlir/unittests/CMakeLists.txt
index 9dbf3bfb4d4e..a8e9212ee255 100644
--- a/mlir/unittests/CMakeLists.txt
+++ b/mlir/unittests/CMakeLists.txt
@@ -10,5 +10,6 @@ add_subdirectory(ExecutionEngine)
 add_subdirectory(Interfaces)
 add_subdirectory(IR)
 add_subdirectory(Pass)
+add_subdirectory(Rewrite)
 add_subdirectory(SDBM)
 add_subdirectory(TableGen)
diff --git a/mlir/unittests/Rewrite/CMakeLists.txt b/mlir/unittests/Rewrite/CMakeLists.txt
new file mode 100644
index 000000000000..c0df7d4eee85
--- /dev/null
+++ b/mlir/unittests/Rewrite/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_mlir_unittest(MLIRRewriteTests
+  PatternBenefit.cpp
+)
+target_link_libraries(MLIRRewriteTests
+  PRIVATE
+  MLIRRewrite
+  MLIRTransformUtils)
diff --git a/mlir/unittests/Rewrite/PatternBenefit.cpp b/mlir/unittests/Rewrite/PatternBenefit.cpp
new file mode 100644
index 000000000000..721ec5ecadc3
--- /dev/null
+++ b/mlir/unittests/Rewrite/PatternBenefit.cpp
@@ -0,0 +1,78 @@
+//===- PatternBenefit.cpp - RewritePattern benefit unit tests -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Rewrite/PatternApplicator.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+
+namespace {
+TEST(PatternBenefitTest, BenefitOrder) {
+  // There was a bug which caused low-benefit op-specific patterns to never be
+  // called in presence of high-benefit op-agnostic pattern
+
+  MLIRContext context;
+
+  OpBuilder builder(&context);
+  auto module = ModuleOp::create(builder.getUnknownLoc());
+
+  struct Pattern1 : public OpRewritePattern<ModuleOp> {
+    Pattern1(mlir::MLIRContext *context, bool *called)
+        : OpRewritePattern(context, /*benefit*/ 1), called(called) {}
+
+    mlir::LogicalResult
+    matchAndRewrite(ModuleOp /*op*/,
+                    mlir::PatternRewriter & /*rewriter*/) const override {
+      *called = true;
+      return failure();
+    }
+
+  private:
+    bool *called;
+  };
+
+  struct Pattern2 : public RewritePattern {
+    Pattern2(bool *called)
+        : RewritePattern(/*benefit*/ 2, MatchAnyOpTypeTag{}), called(called) {}
+
+    mlir::LogicalResult
+    matchAndRewrite(Operation * /*op*/,
+                    mlir::PatternRewriter & /*rewriter*/) const override {
+      *called = true;
+      return failure();
+    }
+
+  private:
+    bool *called;
+  };
+
+  OwningRewritePatternList patterns;
+
+  bool called1 = false;
+  bool called2 = false;
+
+  patterns.insert<Pattern1>(&context, &called1);
+  patterns.insert<Pattern2>(&called2);
+
+  FrozenRewritePatternList frozenPatterns(std::move(patterns));
+  PatternApplicator pa(frozenPatterns);
+  pa.applyDefaultCostModel();
+
+  class MyPatternRewriter : public PatternRewriter {
+  public:
+    MyPatternRewriter(MLIRContext *ctx) : PatternRewriter(ctx) {}
+  };
+
+  MyPatternRewriter rewriter(&context);
+  (void)pa.matchAndRewrite(module, rewriter);
+
+  EXPECT_TRUE(called1);
+  EXPECT_TRUE(called2);
+}
+} // namespace
-- 
GitLab


From
94c269baf58330a5e303a4f86f64681f2f7a858b Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 17 Mar 2021 14:53:57 -0700 Subject: [PATCH 0198/1000] [NewPM] Verify LoopAnalysisResults after a loop pass All loop passes should preserve all analyses in LoopAnalysisResults. Add checks for those. Note that due to PR44815, we don't check LAR's ScalarEvolution. Apparently calling SE.verify() can change its results. Only verify MSSA when VerifyMemorySSA, normally it's very expensive. Reviewed By: asbirlea Differential Revision: https://reviews.llvm.org/D98820 --- llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 60a9602096bb..bea938a7a9cc 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/TimeProfiler.h" using namespace llvm; @@ -291,8 +292,15 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, else PI.runAfterPass(*Pass, *L, PassPA); - // FIXME: We should verify the set of analyses relevant to Loop passes - // are preserved. +#ifndef NDEBUG + // LoopAnalysisResults should always be valid. + // Note that we don't LAR.SE.verify() because that can change observed SE + // queries. See PR44815. + LAR.DT.verify(); + LAR.LI.verify(LAR.DT); + if (LAR.MSSA && VerifyMemorySSA) + LAR.MSSA->verifyMemorySSA(); +#endif // If the loop hasn't been deleted, we need to handle invalidation here. if (!Updater.skipCurrentLoop()) -- GitLab From 436c6c9c20cc522c92a923440a5fc509c342a7db Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Fri, 19 Mar 2021 11:57:01 -0700 Subject: [PATCH 0199/1000] NFC: Break up the mlir python bindings into individual sources. * IRModules.cpp -> (IRCore.cpp, IRAffine.cpp, IRAttributes.cpp, IRTypes.cpp). * The individual pieces now compile in the 5-15s range whereas IRModules.cpp was starting to approach a minute (didn't capture a before time). * More fine grained splitting is possible, but this represents the most obvious. 
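Since this is a pure C++ source reorganization, the Python-visible API is unchanged. As a
sanity check, a sketch of how the affine bindings defined in IRAffine.cpp below are reached
from Python (the import path reflects the usual `mlir` package layout and is an assumption
here, as is the context-manager usage):

    # Sketch only: the .get() statics below default their `context` argument
    # to the context active in the enclosing `with` block.
    from mlir.ir import AffineAddExpr, AffineConstantExpr, AffineDimExpr, Context

    with Context():
        d0 = AffineDimExpr.get(0)
        c1 = AffineConstantExpr.get(1)
        expr = AffineAddExpr.get(d0, c1)  # d0 + 1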
Differential Revision: https://reviews.llvm.org/D98978
---
 mlir/lib/Bindings/Python/CMakeLists.txt      |    5 +-
 mlir/lib/Bindings/Python/ExecutionEngine.cpp |    2 +-
 mlir/lib/Bindings/Python/IRAffine.cpp        |  781 ++++++
 mlir/lib/Bindings/Python/IRAttributes.cpp    |  761 +++++
 .../Python/{IRModules.cpp => IRCore.cpp}     | 2471 ++---------------
 .../Python/{IRModules.h => IRModule.h}       |    5 +-
 mlir/lib/Bindings/Python/IRTypes.cpp         |  678 +++++
 mlir/lib/Bindings/Python/MainModule.cpp      |    7 +-
 mlir/lib/Bindings/Python/Pass.cpp            |    2 +-
 9 files changed, 2394 insertions(+), 2318 deletions(-)
 create mode 100644 mlir/lib/Bindings/Python/IRAffine.cpp
 create mode 100644 mlir/lib/Bindings/Python/IRAttributes.cpp
 rename mlir/lib/Bindings/Python/{IRModules.cpp => IRCore.cpp} (52%)
 rename mlir/lib/Bindings/Python/{IRModules.h => IRModule.h} (99%)
 create mode 100644 mlir/lib/Bindings/Python/IRTypes.cpp

diff --git a/mlir/lib/Bindings/Python/CMakeLists.txt b/mlir/lib/Bindings/Python/CMakeLists.txt
index 5f042ec57c29..5fefa80398c7 100644
--- a/mlir/lib/Bindings/Python/CMakeLists.txt
+++ b/mlir/lib/Bindings/Python/CMakeLists.txt
@@ -70,7 +70,10 @@ add_mlir_python_extension(MLIRCoreBindingsPythonExtension _mlir python
   SOURCES
     MainModule.cpp
-    IRModules.cpp
+    IRAffine.cpp
+    IRAttributes.cpp
+    IRCore.cpp
+    IRTypes.cpp
     PybindUtils.cpp
     Pass.cpp
     ExecutionEngine.cpp
diff --git a/mlir/lib/Bindings/Python/ExecutionEngine.cpp b/mlir/lib/Bindings/Python/ExecutionEngine.cpp
index f6f52e2e0aae..5ca9b1f68128 100644
--- a/mlir/lib/Bindings/Python/ExecutionEngine.cpp
+++ b/mlir/lib/Bindings/Python/ExecutionEngine.cpp
@@ -8,7 +8,7 @@
 
 #include "ExecutionEngine.h"
 
-#include "IRModules.h"
+#include "IRModule.h"
 #include "mlir-c/Bindings/Python/Interop.h"
 #include "mlir-c/ExecutionEngine.h"
diff --git a/mlir/lib/Bindings/Python/IRAffine.cpp b/mlir/lib/Bindings/Python/IRAffine.cpp
new file mode 100644
index 000000000000..73a57d95e158
--- /dev/null
+++ b/mlir/lib/Bindings/Python/IRAffine.cpp
@@ -0,0 +1,781 @@
+//===- IRAffine.cpp - Exports 'ir' module affine related bindings ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "IRModule.h"
+
+#include "PybindUtils.h"
+
+#include "mlir-c/AffineMap.h"
+#include "mlir-c/Bindings/Python/Interop.h"
+#include "mlir-c/IntegerSet.h"
+
+namespace py = pybind11;
+using namespace mlir;
+using namespace mlir::python;
+
+using llvm::SmallVector;
+using llvm::StringRef;
+using llvm::Twine;
+
+static const char kDumpDocstring[] =
+    R"(Dumps a debug representation of the object to stderr.)";
+
+/// Attempts to populate `result` with the content of `list` casted to the
+/// appropriate type (Python and C types are provided as template arguments).
+/// Throws errors in case of failure, using "action" to describe what the
+/// caller was attempting to do.
+template <typename PyType, typename CType>
+static void pyListToVector(py::list list, llvm::SmallVectorImpl<CType> &result,
+                           StringRef action) {
+  result.reserve(py::len(list));
+  for (py::handle item : list) {
+    try {
+      result.push_back(item.cast<PyType>());
+    } catch (py::cast_error &err) {
+      std::string msg = (llvm::Twine("Invalid expression when ") + action +
+                         " (" + err.what() + ")")
+                            .str();
+      throw py::cast_error(msg);
+    } catch (py::reference_cast_error &err) {
+      std::string msg = (llvm::Twine("Invalid expression (None?) when ") +
+                         action + " (" + err.what() + ")")
+                            .str();
+      throw py::cast_error(msg);
+    }
+  }
+}
+
+template <typename PermutationTy>
+static bool isPermutation(std::vector<PermutationTy> permutation) {
+  llvm::SmallVector<bool, 8> seen(permutation.size(), false);
+  for (auto val : permutation) {
+    if (val < permutation.size()) {
+      if (seen[val])
+        return false;
+      seen[val] = true;
+      continue;
+    }
+    return false;
+  }
+  return true;
+}
+
+namespace {
+
+/// CRTP base class for Python MLIR affine expressions that subclass AffineExpr
+/// and should be castable from it. Intermediate hierarchy classes can be
+/// modeled by specifying BaseTy.
+template <typename DerivedTy, typename BaseTy = PyAffineExpr>
+class PyConcreteAffineExpr : public BaseTy {
+public:
+  // Derived classes must define statics for:
+  //   IsAFunctionTy isaFunction
+  //   const char *pyClassName
+  // and redefine bindDerived.
+  using ClassTy = py::class_<DerivedTy, BaseTy>;
+  using IsAFunctionTy = bool (*)(MlirAffineExpr);
+
+  PyConcreteAffineExpr() = default;
+  PyConcreteAffineExpr(PyMlirContextRef contextRef, MlirAffineExpr affineExpr)
+      : BaseTy(std::move(contextRef), affineExpr) {}
+  PyConcreteAffineExpr(PyAffineExpr &orig)
+      : PyConcreteAffineExpr(orig.getContext(), castFrom(orig)) {}
+
+  static MlirAffineExpr castFrom(PyAffineExpr &orig) {
+    if (!DerivedTy::isaFunction(orig)) {
+      auto origRepr = py::repr(py::cast(orig)).cast<std::string>();
+      throw SetPyError(PyExc_ValueError,
+                       Twine("Cannot cast affine expression to ") +
+                           DerivedTy::pyClassName + " (from " + origRepr + ")");
+    }
+    return orig;
+  }
+
+  static void bind(py::module &m) {
+    auto cls = ClassTy(m, DerivedTy::pyClassName);
+    cls.def(py::init<PyAffineExpr &>());
+    DerivedTy::bindDerived(cls);
+  }
+
+  /// Implemented by derived classes to add methods to the Python subclass.
+  static void bindDerived(ClassTy &m) {}
+};
+
+class PyAffineConstantExpr
+    : public PyConcreteAffineExpr<PyAffineConstantExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAConstant;
+  static constexpr const char *pyClassName = "AffineConstantExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  static PyAffineConstantExpr get(intptr_t value,
+                                  DefaultingPyMlirContext context) {
+    MlirAffineExpr affineExpr =
+        mlirAffineConstantExprGet(context->get(), static_cast<int64_t>(value));
+    return PyAffineConstantExpr(context->getRef(), affineExpr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static("get", &PyAffineConstantExpr::get, py::arg("value"),
+                 py::arg("context") = py::none());
+    c.def_property_readonly("value", [](PyAffineConstantExpr &self) {
+      return mlirAffineConstantExprGetValue(self);
+    });
+  }
+};
+
+class PyAffineDimExpr : public PyConcreteAffineExpr<PyAffineDimExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsADim;
+  static constexpr const char *pyClassName = "AffineDimExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  static PyAffineDimExpr get(intptr_t pos, DefaultingPyMlirContext context) {
+    MlirAffineExpr affineExpr = mlirAffineDimExprGet(context->get(), pos);
+    return PyAffineDimExpr(context->getRef(), affineExpr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static("get", &PyAffineDimExpr::get, py::arg("position"),
+                 py::arg("context") = py::none());
+    c.def_property_readonly("position", [](PyAffineDimExpr &self) {
+      return mlirAffineDimExprGetPosition(self);
+    });
+  }
+};
+
+class PyAffineSymbolExpr : public PyConcreteAffineExpr<PyAffineSymbolExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsASymbol;
+  static constexpr const char *pyClassName = "AffineSymbolExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  static PyAffineSymbolExpr get(intptr_t pos, DefaultingPyMlirContext context) {
+    MlirAffineExpr affineExpr = mlirAffineSymbolExprGet(context->get(), pos);
+    return PyAffineSymbolExpr(context->getRef(), affineExpr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static("get", &PyAffineSymbolExpr::get, py::arg("position"),
+                 py::arg("context") = py::none());
+    c.def_property_readonly("position", [](PyAffineSymbolExpr &self) {
+      return mlirAffineSymbolExprGetPosition(self);
+    });
+  }
+};
+
+class PyAffineBinaryExpr : public PyConcreteAffineExpr<PyAffineBinaryExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsABinary;
+  static constexpr const char *pyClassName = "AffineBinaryExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  PyAffineExpr lhs() {
+    MlirAffineExpr lhsExpr = mlirAffineBinaryOpExprGetLHS(get());
+    return PyAffineExpr(getContext(), lhsExpr);
+  }
+
+  PyAffineExpr rhs() {
+    MlirAffineExpr rhsExpr = mlirAffineBinaryOpExprGetRHS(get());
+    return PyAffineExpr(getContext(), rhsExpr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_property_readonly("lhs", &PyAffineBinaryExpr::lhs);
+    c.def_property_readonly("rhs", &PyAffineBinaryExpr::rhs);
+  }
+};
+
+class PyAffineAddExpr
+    : public PyConcreteAffineExpr<PyAffineAddExpr, PyAffineBinaryExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAAdd;
+  static constexpr const char *pyClassName = "AffineAddExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  static PyAffineAddExpr get(PyAffineExpr lhs, PyAffineExpr rhs) {
+    MlirAffineExpr expr = mlirAffineAddExprGet(lhs, rhs);
+    return PyAffineAddExpr(lhs.getContext(), expr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static("get", &PyAffineAddExpr::get);
+  }
+};
+
+class PyAffineMulExpr
+    : public PyConcreteAffineExpr<PyAffineMulExpr, PyAffineBinaryExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAMul;
+  static constexpr const char *pyClassName = "AffineMulExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  static PyAffineMulExpr get(PyAffineExpr lhs, PyAffineExpr rhs) {
+    MlirAffineExpr expr = mlirAffineMulExprGet(lhs, rhs);
+    return PyAffineMulExpr(lhs.getContext(), expr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static("get", &PyAffineMulExpr::get);
+  }
+};
+
+class PyAffineModExpr
+    : public PyConcreteAffineExpr<PyAffineModExpr, PyAffineBinaryExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAMod;
+  static constexpr const char *pyClassName = "AffineModExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  static PyAffineModExpr get(PyAffineExpr lhs, PyAffineExpr rhs) {
+    MlirAffineExpr expr = mlirAffineModExprGet(lhs, rhs);
+    return PyAffineModExpr(lhs.getContext(), expr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static("get", &PyAffineModExpr::get);
+  }
+};
+
+class PyAffineFloorDivExpr
+    : public PyConcreteAffineExpr<PyAffineFloorDivExpr, PyAffineBinaryExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAFloorDiv;
+  static constexpr const char *pyClassName = "AffineFloorDivExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  static PyAffineFloorDivExpr get(PyAffineExpr lhs, PyAffineExpr rhs) {
+    MlirAffineExpr expr = mlirAffineFloorDivExprGet(lhs, rhs);
+    return PyAffineFloorDivExpr(lhs.getContext(), expr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static("get", &PyAffineFloorDivExpr::get);
+  }
+};
+
+class PyAffineCeilDivExpr
+    : public PyConcreteAffineExpr<PyAffineCeilDivExpr, PyAffineBinaryExpr> {
+public:
+  static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsACeilDiv;
+  static constexpr const char *pyClassName = "AffineCeilDivExpr";
+  using PyConcreteAffineExpr::PyConcreteAffineExpr;
+
+  static PyAffineCeilDivExpr get(PyAffineExpr lhs, PyAffineExpr rhs) {
+    MlirAffineExpr expr = mlirAffineCeilDivExprGet(lhs, rhs);
+    return PyAffineCeilDivExpr(lhs.getContext(), expr);
+  }
+
+  static void bindDerived(ClassTy &c) {
+    c.def_static("get", &PyAffineCeilDivExpr::get);
+  }
+};
+
+} // namespace
+
+bool PyAffineExpr::operator==(const PyAffineExpr &other) {
+  return mlirAffineExprEqual(affineExpr, other.affineExpr);
+}
+
+py::object PyAffineExpr::getCapsule() {
+  return py::reinterpret_steal<py::object>(
+      mlirPythonAffineExprToCapsule(*this));
+}
+
+PyAffineExpr PyAffineExpr::createFromCapsule(py::object capsule) {
+  MlirAffineExpr rawAffineExpr = mlirPythonCapsuleToAffineExpr(capsule.ptr());
+  if (mlirAffineExprIsNull(rawAffineExpr))
+    throw py::error_already_set();
+  return PyAffineExpr(
+      PyMlirContext::forContext(mlirAffineExprGetContext(rawAffineExpr)),
+      rawAffineExpr);
+}
+
+//------------------------------------------------------------------------------
+// PyAffineMap and utilities.
+//------------------------------------------------------------------------------
+namespace {
+
+/// A list of expressions contained in an affine map. Internally these are
+/// stored as a consecutive array leading to inexpensive random access. Both
+/// the map and the expression are owned by the context so we need not bother
+/// with lifetime extension.
+class PyAffineMapExprList
+    : public Sliceable<PyAffineMapExprList, PyAffineExpr> {
+public:
+  static constexpr const char *pyClassName = "AffineExprList";
+
+  PyAffineMapExprList(PyAffineMap map, intptr_t startIndex = 0,
+                      intptr_t length = -1, intptr_t step = 1)
+      : Sliceable(startIndex,
+                  length == -1 ? mlirAffineMapGetNumResults(map) : length,
+                  step),
+        affineMap(map) {}
+
+  intptr_t getNumElements() { return mlirAffineMapGetNumResults(affineMap); }
+
+  PyAffineExpr getElement(intptr_t pos) {
+    return PyAffineExpr(affineMap.getContext(),
+                        mlirAffineMapGetResult(affineMap, pos));
+  }
+
+  PyAffineMapExprList slice(intptr_t startIndex, intptr_t length,
+                            intptr_t step) {
+    return PyAffineMapExprList(affineMap, startIndex, length, step);
+  }
+
+private:
+  PyAffineMap affineMap;
+};
+} // end namespace
+
+bool PyAffineMap::operator==(const PyAffineMap &other) {
+  return mlirAffineMapEqual(affineMap, other.affineMap);
+}
+
+py::object PyAffineMap::getCapsule() {
+  return py::reinterpret_steal<py::object>(mlirPythonAffineMapToCapsule(*this));
+}
+
+PyAffineMap PyAffineMap::createFromCapsule(py::object capsule) {
+  MlirAffineMap rawAffineMap = mlirPythonCapsuleToAffineMap(capsule.ptr());
+  if (mlirAffineMapIsNull(rawAffineMap))
+    throw py::error_already_set();
+  return PyAffineMap(
+      PyMlirContext::forContext(mlirAffineMapGetContext(rawAffineMap)),
+      rawAffineMap);
+}
+
+//------------------------------------------------------------------------------
+// PyIntegerSet and utilities.
+//------------------------------------------------------------------------------
+namespace {
+
+class PyIntegerSetConstraint {
+public:
+  PyIntegerSetConstraint(PyIntegerSet set, intptr_t pos) : set(set), pos(pos) {}
+
+  PyAffineExpr getExpr() {
+    return PyAffineExpr(set.getContext(),
+                        mlirIntegerSetGetConstraint(set, pos));
+  }
+
+  bool isEq() { return mlirIntegerSetIsConstraintEq(set, pos); }
+
+  static void bind(py::module &m) {
+    py::class_<PyIntegerSetConstraint>(m, "IntegerSetConstraint")
+        .def_property_readonly("expr", &PyIntegerSetConstraint::getExpr)
+        .def_property_readonly("is_eq", &PyIntegerSetConstraint::isEq);
+  }
+
+private:
+  PyIntegerSet set;
+  intptr_t pos;
+};
+
+class PyIntegerSetConstraintList
+    : public Sliceable<PyIntegerSetConstraintList, PyIntegerSetConstraint> {
+public:
+  static constexpr const char *pyClassName = "IntegerSetConstraintList";
+
+  PyIntegerSetConstraintList(PyIntegerSet set, intptr_t startIndex = 0,
+                             intptr_t length = -1, intptr_t step = 1)
+      : Sliceable(startIndex,
+                  length == -1 ? mlirIntegerSetGetNumConstraints(set) : length,
+                  step),
+        set(set) {}
+
+  intptr_t getNumElements() { return mlirIntegerSetGetNumConstraints(set); }
+
+  PyIntegerSetConstraint getElement(intptr_t pos) {
+    return PyIntegerSetConstraint(set, pos);
+  }
+
+  PyIntegerSetConstraintList slice(intptr_t startIndex, intptr_t length,
+                                   intptr_t step) {
+    return PyIntegerSetConstraintList(set, startIndex, length, step);
+  }
+
+private:
+  PyIntegerSet set;
+};
+} // namespace
+
+bool PyIntegerSet::operator==(const PyIntegerSet &other) {
+  return mlirIntegerSetEqual(integerSet, other.integerSet);
+}
+
+py::object PyIntegerSet::getCapsule() {
+  return py::reinterpret_steal<py::object>(
+      mlirPythonIntegerSetToCapsule(*this));
+}
+
+PyIntegerSet PyIntegerSet::createFromCapsule(py::object capsule) {
+  MlirIntegerSet rawIntegerSet = mlirPythonCapsuleToIntegerSet(capsule.ptr());
+  if (mlirIntegerSetIsNull(rawIntegerSet))
+    throw py::error_already_set();
+  return PyIntegerSet(
+      PyMlirContext::forContext(mlirIntegerSetGetContext(rawIntegerSet)),
+      rawIntegerSet);
+}
+
+void mlir::python::populateIRAffine(py::module &m) {
+  //----------------------------------------------------------------------------
+  // Mapping of PyAffineExpr and derived classes.
+ //---------------------------------------------------------------------------- + py::class_(m, "AffineExpr") + .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, + &PyAffineExpr::getCapsule) + .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyAffineExpr::createFromCapsule) + .def("__add__", + [](PyAffineExpr &self, PyAffineExpr &other) { + return PyAffineAddExpr::get(self, other); + }) + .def("__mul__", + [](PyAffineExpr &self, PyAffineExpr &other) { + return PyAffineMulExpr::get(self, other); + }) + .def("__mod__", + [](PyAffineExpr &self, PyAffineExpr &other) { + return PyAffineModExpr::get(self, other); + }) + .def("__sub__", + [](PyAffineExpr &self, PyAffineExpr &other) { + auto negOne = + PyAffineConstantExpr::get(-1, *self.getContext().get()); + return PyAffineAddExpr::get(self, + PyAffineMulExpr::get(negOne, other)); + }) + .def("__eq__", [](PyAffineExpr &self, + PyAffineExpr &other) { return self == other; }) + .def("__eq__", + [](PyAffineExpr &self, py::object &other) { return false; }) + .def("__str__", + [](PyAffineExpr &self) { + PyPrintAccumulator printAccum; + mlirAffineExprPrint(self, printAccum.getCallback(), + printAccum.getUserData()); + return printAccum.join(); + }) + .def("__repr__", + [](PyAffineExpr &self) { + PyPrintAccumulator printAccum; + printAccum.parts.append("AffineExpr("); + mlirAffineExprPrint(self, printAccum.getCallback(), + printAccum.getUserData()); + printAccum.parts.append(")"); + return printAccum.join(); + }) + .def_property_readonly( + "context", + [](PyAffineExpr &self) { return self.getContext().getObject(); }) + .def_static( + "get_add", &PyAffineAddExpr::get, + "Gets an affine expression containing a sum of two expressions.") + .def_static( + "get_mul", &PyAffineMulExpr::get, + "Gets an affine expression containing a product of two expressions.") + .def_static("get_mod", &PyAffineModExpr::get, + "Gets an affine expression containing the modulo of dividing " + "one expression by another.") + .def_static("get_floor_div", &PyAffineFloorDivExpr::get, + "Gets an affine expression containing the rounded-down " + "result of dividing one expression by another.") + .def_static("get_ceil_div", &PyAffineCeilDivExpr::get, + "Gets an affine expression containing the rounded-up result " + "of dividing one expression by another.") + .def_static("get_constant", &PyAffineConstantExpr::get, py::arg("value"), + py::arg("context") = py::none(), + "Gets a constant affine expression with the given value.") + .def_static( + "get_dim", &PyAffineDimExpr::get, py::arg("position"), + py::arg("context") = py::none(), + "Gets an affine expression of a dimension at the given position.") + .def_static( + "get_symbol", &PyAffineSymbolExpr::get, py::arg("position"), + py::arg("context") = py::none(), + "Gets an affine expression of a symbol at the given position.") + .def( + "dump", [](PyAffineExpr &self) { mlirAffineExprDump(self); }, + kDumpDocstring); + PyAffineConstantExpr::bind(m); + PyAffineDimExpr::bind(m); + PyAffineSymbolExpr::bind(m); + PyAffineBinaryExpr::bind(m); + PyAffineAddExpr::bind(m); + PyAffineMulExpr::bind(m); + PyAffineModExpr::bind(m); + PyAffineFloorDivExpr::bind(m); + PyAffineCeilDivExpr::bind(m); + + //---------------------------------------------------------------------------- + // Mapping of PyAffineMap. 
+ //---------------------------------------------------------------------------- + py::class_(m, "AffineMap") + .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, + &PyAffineMap::getCapsule) + .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyAffineMap::createFromCapsule) + .def("__eq__", + [](PyAffineMap &self, PyAffineMap &other) { return self == other; }) + .def("__eq__", [](PyAffineMap &self, py::object &other) { return false; }) + .def("__str__", + [](PyAffineMap &self) { + PyPrintAccumulator printAccum; + mlirAffineMapPrint(self, printAccum.getCallback(), + printAccum.getUserData()); + return printAccum.join(); + }) + .def("__repr__", + [](PyAffineMap &self) { + PyPrintAccumulator printAccum; + printAccum.parts.append("AffineMap("); + mlirAffineMapPrint(self, printAccum.getCallback(), + printAccum.getUserData()); + printAccum.parts.append(")"); + return printAccum.join(); + }) + .def_property_readonly( + "context", + [](PyAffineMap &self) { return self.getContext().getObject(); }, + "Context that owns the Affine Map") + .def( + "dump", [](PyAffineMap &self) { mlirAffineMapDump(self); }, + kDumpDocstring) + .def_static( + "get", + [](intptr_t dimCount, intptr_t symbolCount, py::list exprs, + DefaultingPyMlirContext context) { + SmallVector affineExprs; + pyListToVector( + exprs, affineExprs, "attempting to create an AffineMap"); + MlirAffineMap map = + mlirAffineMapGet(context->get(), dimCount, symbolCount, + affineExprs.size(), affineExprs.data()); + return PyAffineMap(context->getRef(), map); + }, + py::arg("dim_count"), py::arg("symbol_count"), py::arg("exprs"), + py::arg("context") = py::none(), + "Gets a map with the given expressions as results.") + .def_static( + "get_constant", + [](intptr_t value, DefaultingPyMlirContext context) { + MlirAffineMap affineMap = + mlirAffineMapConstantGet(context->get(), value); + return PyAffineMap(context->getRef(), affineMap); + }, + py::arg("value"), py::arg("context") = py::none(), + "Gets an affine map with a single constant result") + .def_static( + "get_empty", + [](DefaultingPyMlirContext context) { + MlirAffineMap affineMap = mlirAffineMapEmptyGet(context->get()); + return PyAffineMap(context->getRef(), affineMap); + }, + py::arg("context") = py::none(), "Gets an empty affine map.") + .def_static( + "get_identity", + [](intptr_t nDims, DefaultingPyMlirContext context) { + MlirAffineMap affineMap = + mlirAffineMapMultiDimIdentityGet(context->get(), nDims); + return PyAffineMap(context->getRef(), affineMap); + }, + py::arg("n_dims"), py::arg("context") = py::none(), + "Gets an identity map with the given number of dimensions.") + .def_static( + "get_minor_identity", + [](intptr_t nDims, intptr_t nResults, + DefaultingPyMlirContext context) { + MlirAffineMap affineMap = + mlirAffineMapMinorIdentityGet(context->get(), nDims, nResults); + return PyAffineMap(context->getRef(), affineMap); + }, + py::arg("n_dims"), py::arg("n_results"), + py::arg("context") = py::none(), + "Gets a minor identity map with the given number of dimensions and " + "results.") + .def_static( + "get_permutation", + [](std::vector permutation, + DefaultingPyMlirContext context) { + if (!isPermutation(permutation)) + throw py::cast_error("Invalid permutation when attempting to " + "create an AffineMap"); + MlirAffineMap affineMap = mlirAffineMapPermutationGet( + context->get(), permutation.size(), permutation.data()); + return PyAffineMap(context->getRef(), affineMap); + }, + py::arg("permutation"), py::arg("context") = py::none(), + "Gets an affine map that permutes its 
inputs.") + .def("get_submap", + [](PyAffineMap &self, std::vector &resultPos) { + intptr_t numResults = mlirAffineMapGetNumResults(self); + for (intptr_t pos : resultPos) { + if (pos < 0 || pos >= numResults) + throw py::value_error("result position out of bounds"); + } + MlirAffineMap affineMap = mlirAffineMapGetSubMap( + self, resultPos.size(), resultPos.data()); + return PyAffineMap(self.getContext(), affineMap); + }) + .def("get_major_submap", + [](PyAffineMap &self, intptr_t nResults) { + if (nResults >= mlirAffineMapGetNumResults(self)) + throw py::value_error("number of results out of bounds"); + MlirAffineMap affineMap = + mlirAffineMapGetMajorSubMap(self, nResults); + return PyAffineMap(self.getContext(), affineMap); + }) + .def("get_minor_submap", + [](PyAffineMap &self, intptr_t nResults) { + if (nResults >= mlirAffineMapGetNumResults(self)) + throw py::value_error("number of results out of bounds"); + MlirAffineMap affineMap = + mlirAffineMapGetMinorSubMap(self, nResults); + return PyAffineMap(self.getContext(), affineMap); + }) + .def_property_readonly( + "is_permutation", + [](PyAffineMap &self) { return mlirAffineMapIsPermutation(self); }) + .def_property_readonly("is_projected_permutation", + [](PyAffineMap &self) { + return mlirAffineMapIsProjectedPermutation(self); + }) + .def_property_readonly( + "n_dims", + [](PyAffineMap &self) { return mlirAffineMapGetNumDims(self); }) + .def_property_readonly( + "n_inputs", + [](PyAffineMap &self) { return mlirAffineMapGetNumInputs(self); }) + .def_property_readonly( + "n_symbols", + [](PyAffineMap &self) { return mlirAffineMapGetNumSymbols(self); }) + .def_property_readonly("results", [](PyAffineMap &self) { + return PyAffineMapExprList(self); + }); + PyAffineMapExprList::bind(m); + + //---------------------------------------------------------------------------- + // Mapping of PyIntegerSet. + //---------------------------------------------------------------------------- + py::class_(m, "IntegerSet") + .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, + &PyIntegerSet::getCapsule) + .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyIntegerSet::createFromCapsule) + .def("__eq__", [](PyIntegerSet &self, + PyIntegerSet &other) { return self == other; }) + .def("__eq__", [](PyIntegerSet &self, py::object other) { return false; }) + .def("__str__", + [](PyIntegerSet &self) { + PyPrintAccumulator printAccum; + mlirIntegerSetPrint(self, printAccum.getCallback(), + printAccum.getUserData()); + return printAccum.join(); + }) + .def("__repr__", + [](PyIntegerSet &self) { + PyPrintAccumulator printAccum; + printAccum.parts.append("IntegerSet("); + mlirIntegerSetPrint(self, printAccum.getCallback(), + printAccum.getUserData()); + printAccum.parts.append(")"); + return printAccum.join(); + }) + .def_property_readonly( + "context", + [](PyIntegerSet &self) { return self.getContext().getObject(); }) + .def( + "dump", [](PyIntegerSet &self) { mlirIntegerSetDump(self); }, + kDumpDocstring) + .def_static( + "get", + [](intptr_t numDims, intptr_t numSymbols, py::list exprs, + std::vector eqFlags, DefaultingPyMlirContext context) { + if (exprs.size() != eqFlags.size()) + throw py::value_error( + "Expected the number of constraints to match " + "that of equality flags"); + if (exprs.empty()) + throw py::value_error("Expected non-empty list of constraints"); + + // Copy over to a SmallVector because std::vector has a + // specialization for booleans that packs data and does not + // expose a `bool *`. 
+ SmallVector flags(eqFlags.begin(), eqFlags.end()); + + SmallVector affineExprs; + pyListToVector(exprs, affineExprs, + "attempting to create an IntegerSet"); + MlirIntegerSet set = mlirIntegerSetGet( + context->get(), numDims, numSymbols, exprs.size(), + affineExprs.data(), flags.data()); + return PyIntegerSet(context->getRef(), set); + }, + py::arg("num_dims"), py::arg("num_symbols"), py::arg("exprs"), + py::arg("eq_flags"), py::arg("context") = py::none()) + .def_static( + "get_empty", + [](intptr_t numDims, intptr_t numSymbols, + DefaultingPyMlirContext context) { + MlirIntegerSet set = + mlirIntegerSetEmptyGet(context->get(), numDims, numSymbols); + return PyIntegerSet(context->getRef(), set); + }, + py::arg("num_dims"), py::arg("num_symbols"), + py::arg("context") = py::none()) + .def("get_replaced", + [](PyIntegerSet &self, py::list dimExprs, py::list symbolExprs, + intptr_t numResultDims, intptr_t numResultSymbols) { + if (static_cast(dimExprs.size()) != + mlirIntegerSetGetNumDims(self)) + throw py::value_error( + "Expected the number of dimension replacement expressions " + "to match that of dimensions"); + if (static_cast(symbolExprs.size()) != + mlirIntegerSetGetNumSymbols(self)) + throw py::value_error( + "Expected the number of symbol replacement expressions " + "to match that of symbols"); + + SmallVector dimAffineExprs, symbolAffineExprs; + pyListToVector( + dimExprs, dimAffineExprs, + "attempting to create an IntegerSet by replacing dimensions"); + pyListToVector( + symbolExprs, symbolAffineExprs, + "attempting to create an IntegerSet by replacing symbols"); + MlirIntegerSet set = mlirIntegerSetReplaceGet( + self, dimAffineExprs.data(), symbolAffineExprs.data(), + numResultDims, numResultSymbols); + return PyIntegerSet(self.getContext(), set); + }) + .def_property_readonly("is_canonical_empty", + [](PyIntegerSet &self) { + return mlirIntegerSetIsCanonicalEmpty(self); + }) + .def_property_readonly( + "n_dims", + [](PyIntegerSet &self) { return mlirIntegerSetGetNumDims(self); }) + .def_property_readonly( + "n_symbols", + [](PyIntegerSet &self) { return mlirIntegerSetGetNumSymbols(self); }) + .def_property_readonly( + "n_inputs", + [](PyIntegerSet &self) { return mlirIntegerSetGetNumInputs(self); }) + .def_property_readonly("n_equalities", + [](PyIntegerSet &self) { + return mlirIntegerSetGetNumEqualities(self); + }) + .def_property_readonly("n_inequalities", + [](PyIntegerSet &self) { + return mlirIntegerSetGetNumInequalities(self); + }) + .def_property_readonly("constraints", [](PyIntegerSet &self) { + return PyIntegerSetConstraintList(self); + }); + PyIntegerSetConstraint::bind(m); + PyIntegerSetConstraintList::bind(m); +} diff --git a/mlir/lib/Bindings/Python/IRAttributes.cpp b/mlir/lib/Bindings/Python/IRAttributes.cpp new file mode 100644 index 000000000000..6f9206c1b912 --- /dev/null +++ b/mlir/lib/Bindings/Python/IRAttributes.cpp @@ -0,0 +1,761 @@ +//===- IRAttributes.cpp - Exports builtin and standard attributes ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "IRModule.h" + +#include "PybindUtils.h" + +#include "mlir-c/BuiltinAttributes.h" +#include "mlir-c/BuiltinTypes.h" + +namespace py = pybind11; +using namespace mlir; +using namespace mlir::python; + +using llvm::SmallVector; +using llvm::StringRef; +using llvm::Twine; + +namespace { + +static MlirStringRef toMlirStringRef(const std::string &s) { + return mlirStringRefCreate(s.data(), s.size()); +} + +/// CRTP base classes for Python attributes that subclass Attribute and should +/// be castable from it (i.e. via something like StringAttr(attr)). +/// By default, attribute class hierarchies are one level deep (i.e. a +/// concrete attribute class extends PyAttribute); however, intermediate +/// python-visible base classes can be modeled by specifying a BaseTy. +template +class PyConcreteAttribute : public BaseTy { +public: + // Derived classes must define statics for: + // IsAFunctionTy isaFunction + // const char *pyClassName + using ClassTy = py::class_; + using IsAFunctionTy = bool (*)(MlirAttribute); + + PyConcreteAttribute() = default; + PyConcreteAttribute(PyMlirContextRef contextRef, MlirAttribute attr) + : BaseTy(std::move(contextRef), attr) {} + PyConcreteAttribute(PyAttribute &orig) + : PyConcreteAttribute(orig.getContext(), castFrom(orig)) {} + + static MlirAttribute castFrom(PyAttribute &orig) { + if (!DerivedTy::isaFunction(orig)) { + auto origRepr = py::repr(py::cast(orig)).cast(); + throw SetPyError(PyExc_ValueError, Twine("Cannot cast attribute to ") + + DerivedTy::pyClassName + + " (from " + origRepr + ")"); + } + return orig; + } + + static void bind(py::module &m) { + auto cls = ClassTy(m, DerivedTy::pyClassName, py::buffer_protocol()); + cls.def(py::init(), py::keep_alive<0, 1>()); + DerivedTy::bindDerived(cls); + } + + /// Implemented by derived classes to add methods to the Python subclass. 
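+  /// For example, most concrete attribute classes below use their
+  /// bindDerived override to install a static `get` factory and a
+  /// read-only `value` accessor.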
+ static void bindDerived(ClassTy &m) {} +}; + +class PyAffineMapAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAAffineMap; + static constexpr const char *pyClassName = "AffineMapAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyAffineMap &affineMap) { + MlirAttribute attr = mlirAffineMapAttrGet(affineMap.get()); + return PyAffineMapAttribute(affineMap.getContext(), attr); + }, + py::arg("affine_map"), "Gets an attribute wrapping an AffineMap."); + } +}; + +class PyArrayAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAArray; + static constexpr const char *pyClassName = "ArrayAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + class PyArrayAttributeIterator { + public: + PyArrayAttributeIterator(PyAttribute attr) : attr(attr) {} + + PyArrayAttributeIterator &dunderIter() { return *this; } + + PyAttribute dunderNext() { + if (nextIndex >= mlirArrayAttrGetNumElements(attr.get())) { + throw py::stop_iteration(); + } + return PyAttribute(attr.getContext(), + mlirArrayAttrGetElement(attr.get(), nextIndex++)); + } + + static void bind(py::module &m) { + py::class_(m, "ArrayAttributeIterator") + .def("__iter__", &PyArrayAttributeIterator::dunderIter) + .def("__next__", &PyArrayAttributeIterator::dunderNext); + } + + private: + PyAttribute attr; + int nextIndex = 0; + }; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](py::list attributes, DefaultingPyMlirContext context) { + SmallVector mlirAttributes; + mlirAttributes.reserve(py::len(attributes)); + for (auto attribute : attributes) { + try { + mlirAttributes.push_back(attribute.cast()); + } catch (py::cast_error &err) { + std::string msg = std::string("Invalid attribute when attempting " + "to create an ArrayAttribute (") + + err.what() + ")"; + throw py::cast_error(msg); + } catch (py::reference_cast_error &err) { + // This exception seems thrown when the value is "None". + std::string msg = + std::string("Invalid attribute (None?) when attempting to " + "create an ArrayAttribute (") + + err.what() + ")"; + throw py::cast_error(msg); + } + } + MlirAttribute attr = mlirArrayAttrGet( + context->get(), mlirAttributes.size(), mlirAttributes.data()); + return PyArrayAttribute(context->getRef(), attr); + }, + py::arg("attributes"), py::arg("context") = py::none(), + "Gets a uniqued Array attribute"); + c.def("__getitem__", + [](PyArrayAttribute &arr, intptr_t i) { + if (i >= mlirArrayAttrGetNumElements(arr)) + throw py::index_error("ArrayAttribute index out of range"); + return PyAttribute(arr.getContext(), + mlirArrayAttrGetElement(arr, i)); + }) + .def("__len__", + [](const PyArrayAttribute &arr) { + return mlirArrayAttrGetNumElements(arr); + }) + .def("__iter__", [](const PyArrayAttribute &arr) { + return PyArrayAttributeIterator(arr); + }); + } +}; + +/// Float Point Attribute subclass - FloatAttr. 
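+/// Illustrative Python-side use of the binding below (hypothetical
+/// session, assuming an active Context):
+///   FloatAttr.get_f32(42.0).value  # => 42.0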
+class PyFloatAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAFloat; + static constexpr const char *pyClassName = "FloatAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyType &type, double value, DefaultingPyLocation loc) { + MlirAttribute attr = mlirFloatAttrDoubleGetChecked(loc, type, value); + // TODO: Rework error reporting once diagnostic engine is exposed + // in C API. + if (mlirAttributeIsNull(attr)) { + throw SetPyError(PyExc_ValueError, + Twine("invalid '") + + py::repr(py::cast(type)).cast() + + "' and expected floating point type."); + } + return PyFloatAttribute(type.getContext(), attr); + }, + py::arg("type"), py::arg("value"), py::arg("loc") = py::none(), + "Gets an uniqued float point attribute associated to a type"); + c.def_static( + "get_f32", + [](double value, DefaultingPyMlirContext context) { + MlirAttribute attr = mlirFloatAttrDoubleGet( + context->get(), mlirF32TypeGet(context->get()), value); + return PyFloatAttribute(context->getRef(), attr); + }, + py::arg("value"), py::arg("context") = py::none(), + "Gets an uniqued float point attribute associated to a f32 type"); + c.def_static( + "get_f64", + [](double value, DefaultingPyMlirContext context) { + MlirAttribute attr = mlirFloatAttrDoubleGet( + context->get(), mlirF64TypeGet(context->get()), value); + return PyFloatAttribute(context->getRef(), attr); + }, + py::arg("value"), py::arg("context") = py::none(), + "Gets an uniqued float point attribute associated to a f64 type"); + c.def_property_readonly( + "value", + [](PyFloatAttribute &self) { + return mlirFloatAttrGetValueDouble(self); + }, + "Returns the value of the float point attribute"); + } +}; + +/// Integer Attribute subclass - IntegerAttr. +class PyIntegerAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAInteger; + static constexpr const char *pyClassName = "IntegerAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyType &type, int64_t value) { + MlirAttribute attr = mlirIntegerAttrGet(type, value); + return PyIntegerAttribute(type.getContext(), attr); + }, + py::arg("type"), py::arg("value"), + "Gets an uniqued integer attribute associated to a type"); + c.def_property_readonly( + "value", + [](PyIntegerAttribute &self) { + return mlirIntegerAttrGetValueInt(self); + }, + "Returns the value of the integer attribute"); + } +}; + +/// Bool Attribute subclass - BoolAttr. 
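+/// Illustrative Python-side use (hypothetical session, assuming an active
+/// Context): `BoolAttr.get(True).value` evaluates to True.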
+class PyBoolAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsABool; + static constexpr const char *pyClassName = "BoolAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](bool value, DefaultingPyMlirContext context) { + MlirAttribute attr = mlirBoolAttrGet(context->get(), value); + return PyBoolAttribute(context->getRef(), attr); + }, + py::arg("value"), py::arg("context") = py::none(), + "Gets an uniqued bool attribute"); + c.def_property_readonly( + "value", + [](PyBoolAttribute &self) { return mlirBoolAttrGetValue(self); }, + "Returns the value of the bool attribute"); + } +}; + +class PyFlatSymbolRefAttribute + : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAFlatSymbolRef; + static constexpr const char *pyClassName = "FlatSymbolRefAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](std::string value, DefaultingPyMlirContext context) { + MlirAttribute attr = + mlirFlatSymbolRefAttrGet(context->get(), toMlirStringRef(value)); + return PyFlatSymbolRefAttribute(context->getRef(), attr); + }, + py::arg("value"), py::arg("context") = py::none(), + "Gets a uniqued FlatSymbolRef attribute"); + c.def_property_readonly( + "value", + [](PyFlatSymbolRefAttribute &self) { + MlirStringRef stringRef = mlirFlatSymbolRefAttrGetValue(self); + return py::str(stringRef.data, stringRef.length); + }, + "Returns the value of the FlatSymbolRef attribute as a string"); + } +}; + +class PyStringAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAString; + static constexpr const char *pyClassName = "StringAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](std::string value, DefaultingPyMlirContext context) { + MlirAttribute attr = + mlirStringAttrGet(context->get(), toMlirStringRef(value)); + return PyStringAttribute(context->getRef(), attr); + }, + py::arg("value"), py::arg("context") = py::none(), + "Gets a uniqued string attribute"); + c.def_static( + "get_typed", + [](PyType &type, std::string value) { + MlirAttribute attr = + mlirStringAttrTypedGet(type, toMlirStringRef(value)); + return PyStringAttribute(type.getContext(), attr); + }, + + "Gets a uniqued string attribute associated to a type"); + c.def_property_readonly( + "value", + [](PyStringAttribute &self) { + MlirStringRef stringRef = mlirStringAttrGetValue(self); + return py::str(stringRef.data, stringRef.length); + }, + "Returns the value of the string attribute"); + } +}; + +// TODO: Support construction of bool elements. +// TODO: Support construction of string elements. +class PyDenseElementsAttribute + : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsADenseElements; + static constexpr const char *pyClassName = "DenseElementsAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static PyDenseElementsAttribute + getFromBuffer(py::buffer array, bool signless, + DefaultingPyMlirContext contextWrapper) { + // Request a contiguous view. In exotic cases, this will cause a copy. 
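+    // PyBUF_FORMAT asks CPython to also fill in the struct-style format
+    // string consumed below; exporters that cannot satisfy the request
+    // fail, which is surfaced as py::error_already_set.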
+ int flags = PyBUF_C_CONTIGUOUS | PyBUF_FORMAT; + Py_buffer *view = new Py_buffer(); + if (PyObject_GetBuffer(array.ptr(), view, flags) != 0) { + delete view; + throw py::error_already_set(); + } + py::buffer_info arrayInfo(view); + + MlirContext context = contextWrapper->get(); + // Switch on the types that can be bulk loaded between the Python and + // MLIR-C APIs. + // See: https://docs.python.org/3/library/struct.html#format-characters + if (arrayInfo.format == "f") { + // f32 + assert(arrayInfo.itemsize == 4 && "mismatched array itemsize"); + return PyDenseElementsAttribute( + contextWrapper->getRef(), + bulkLoad(context, mlirDenseElementsAttrFloatGet, + mlirF32TypeGet(context), arrayInfo)); + } else if (arrayInfo.format == "d") { + // f64 + assert(arrayInfo.itemsize == 8 && "mismatched array itemsize"); + return PyDenseElementsAttribute( + contextWrapper->getRef(), + bulkLoad(context, mlirDenseElementsAttrDoubleGet, + mlirF64TypeGet(context), arrayInfo)); + } else if (isSignedIntegerFormat(arrayInfo.format)) { + if (arrayInfo.itemsize == 4) { + // i32 + MlirType elementType = signless ? mlirIntegerTypeGet(context, 32) + : mlirIntegerTypeSignedGet(context, 32); + return PyDenseElementsAttribute(contextWrapper->getRef(), + bulkLoad(context, + mlirDenseElementsAttrInt32Get, + elementType, arrayInfo)); + } else if (arrayInfo.itemsize == 8) { + // i64 + MlirType elementType = signless ? mlirIntegerTypeGet(context, 64) + : mlirIntegerTypeSignedGet(context, 64); + return PyDenseElementsAttribute(contextWrapper->getRef(), + bulkLoad(context, + mlirDenseElementsAttrInt64Get, + elementType, arrayInfo)); + } + } else if (isUnsignedIntegerFormat(arrayInfo.format)) { + if (arrayInfo.itemsize == 4) { + // unsigned i32 + MlirType elementType = signless + ? mlirIntegerTypeGet(context, 32) + : mlirIntegerTypeUnsignedGet(context, 32); + return PyDenseElementsAttribute(contextWrapper->getRef(), + bulkLoad(context, + mlirDenseElementsAttrUInt32Get, + elementType, arrayInfo)); + } else if (arrayInfo.itemsize == 8) { + // unsigned i64 + MlirType elementType = signless + ? mlirIntegerTypeGet(context, 64) + : mlirIntegerTypeUnsignedGet(context, 64); + return PyDenseElementsAttribute(contextWrapper->getRef(), + bulkLoad(context, + mlirDenseElementsAttrUInt64Get, + elementType, arrayInfo)); + } + } + + // TODO: Fall back to string-based get. 
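+    // For example, half-precision ('e') and 16-bit integer ('h'/'H')
+    // buffers currently take this error path.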
+ std::string message = "unimplemented array format conversion from format: "; + message.append(arrayInfo.format); + throw SetPyError(PyExc_ValueError, message); + } + + static PyDenseElementsAttribute getSplat(PyType shapedType, + PyAttribute &elementAttr) { + auto contextWrapper = + PyMlirContext::forContext(mlirTypeGetContext(shapedType)); + if (!mlirAttributeIsAInteger(elementAttr) && + !mlirAttributeIsAFloat(elementAttr)) { + std::string message = "Illegal element type for DenseElementsAttr: "; + message.append(py::repr(py::cast(elementAttr))); + throw SetPyError(PyExc_ValueError, message); + } + if (!mlirTypeIsAShaped(shapedType) || + !mlirShapedTypeHasStaticShape(shapedType)) { + std::string message = + "Expected a static ShapedType for the shaped_type parameter: "; + message.append(py::repr(py::cast(shapedType))); + throw SetPyError(PyExc_ValueError, message); + } + MlirType shapedElementType = mlirShapedTypeGetElementType(shapedType); + MlirType attrType = mlirAttributeGetType(elementAttr); + if (!mlirTypeEqual(shapedElementType, attrType)) { + std::string message = + "Shaped element type and attribute type must be equal: shaped="; + message.append(py::repr(py::cast(shapedType))); + message.append(", element="); + message.append(py::repr(py::cast(elementAttr))); + throw SetPyError(PyExc_ValueError, message); + } + + MlirAttribute elements = + mlirDenseElementsAttrSplatGet(shapedType, elementAttr); + return PyDenseElementsAttribute(contextWrapper->getRef(), elements); + } + + intptr_t dunderLen() { return mlirElementsAttrGetNumElements(*this); } + + py::buffer_info accessBuffer() { + MlirType shapedType = mlirAttributeGetType(*this); + MlirType elementType = mlirShapedTypeGetElementType(shapedType); + + if (mlirTypeIsAF32(elementType)) { + // f32 + return bufferInfo(shapedType, mlirDenseElementsAttrGetFloatValue); + } else if (mlirTypeIsAF64(elementType)) { + // f64 + return bufferInfo(shapedType, mlirDenseElementsAttrGetDoubleValue); + } else if (mlirTypeIsAInteger(elementType) && + mlirIntegerTypeGetWidth(elementType) == 32) { + if (mlirIntegerTypeIsSignless(elementType) || + mlirIntegerTypeIsSigned(elementType)) { + // i32 + return bufferInfo(shapedType, mlirDenseElementsAttrGetInt32Value); + } else if (mlirIntegerTypeIsUnsigned(elementType)) { + // unsigned i32 + return bufferInfo(shapedType, mlirDenseElementsAttrGetUInt32Value); + } + } else if (mlirTypeIsAInteger(elementType) && + mlirIntegerTypeGetWidth(elementType) == 64) { + if (mlirIntegerTypeIsSignless(elementType) || + mlirIntegerTypeIsSigned(elementType)) { + // i64 + return bufferInfo(shapedType, mlirDenseElementsAttrGetInt64Value); + } else if (mlirIntegerTypeIsUnsigned(elementType)) { + // unsigned i64 + return bufferInfo(shapedType, mlirDenseElementsAttrGetUInt64Value); + } + } + + std::string message = "unimplemented array format."; + throw SetPyError(PyExc_ValueError, message); + } + + static void bindDerived(ClassTy &c) { + c.def("__len__", &PyDenseElementsAttribute::dunderLen) + .def_static("get", PyDenseElementsAttribute::getFromBuffer, + py::arg("array"), py::arg("signless") = true, + py::arg("context") = py::none(), + "Gets from a buffer or ndarray") + .def_static("get_splat", PyDenseElementsAttribute::getSplat, + py::arg("shaped_type"), py::arg("element_attr"), + "Gets a DenseElementsAttr where all values are the same") + .def_property_readonly("is_splat", + [](PyDenseElementsAttribute &self) -> bool { + return mlirDenseElementsAttrIsSplat(self); + }) + .def_buffer(&PyDenseElementsAttribute::accessBuffer); 
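+    // Because PyConcreteAttribute::bind registers the class with
+    // py::buffer_protocol(), this def_buffer hook is what lets Python-side
+    // code such as `numpy.array(dense_attr)` (illustrative) read the
+    // elements directly.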
+ } + +private: + template + static MlirAttribute + bulkLoad(MlirContext context, + MlirAttribute (*ctor)(MlirType, intptr_t, ElementTy *), + MlirType mlirElementType, py::buffer_info &arrayInfo) { + SmallVector shape(arrayInfo.shape.begin(), + arrayInfo.shape.begin() + arrayInfo.ndim); + auto shapedType = + mlirRankedTensorTypeGet(shape.size(), shape.data(), mlirElementType); + intptr_t numElements = arrayInfo.size; + const ElementTy *contents = static_cast(arrayInfo.ptr); + return ctor(shapedType, numElements, contents); + } + + static bool isUnsignedIntegerFormat(const std::string &format) { + if (format.empty()) + return false; + char code = format[0]; + return code == 'I' || code == 'B' || code == 'H' || code == 'L' || + code == 'Q'; + } + + static bool isSignedIntegerFormat(const std::string &format) { + if (format.empty()) + return false; + char code = format[0]; + return code == 'i' || code == 'b' || code == 'h' || code == 'l' || + code == 'q'; + } + + template + py::buffer_info bufferInfo(MlirType shapedType, + Type (*value)(MlirAttribute, intptr_t)) { + intptr_t rank = mlirShapedTypeGetRank(shapedType); + // Prepare the data for the buffer_info. + // Buffer is configured for read-only access below. + Type *data = static_cast( + const_cast(mlirDenseElementsAttrGetRawData(*this))); + // Prepare the shape for the buffer_info. + SmallVector shape; + for (intptr_t i = 0; i < rank; ++i) + shape.push_back(mlirShapedTypeGetDimSize(shapedType, i)); + // Prepare the strides for the buffer_info. + SmallVector strides; + intptr_t strideFactor = 1; + for (intptr_t i = 1; i < rank; ++i) { + strideFactor = 1; + for (intptr_t j = i; j < rank; ++j) { + strideFactor *= mlirShapedTypeGetDimSize(shapedType, j); + } + strides.push_back(sizeof(Type) * strideFactor); + } + strides.push_back(sizeof(Type)); + return py::buffer_info(data, sizeof(Type), + py::format_descriptor::format(), rank, shape, + strides, /*readonly=*/true); + } +}; // namespace + +/// Refinement of the PyDenseElementsAttribute for attributes containing integer +/// (and boolean) values. Supports element access. +class PyDenseIntElementsAttribute + : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsADenseIntElements; + static constexpr const char *pyClassName = "DenseIntElementsAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + /// Returns the element at the given linear position. Asserts if the index is + /// out of range. + py::int_ dunderGetItem(intptr_t pos) { + if (pos < 0 || pos >= dunderLen()) { + throw SetPyError(PyExc_IndexError, + "attempt to access out of bounds element"); + } + + MlirType type = mlirAttributeGetType(*this); + type = mlirShapedTypeGetElementType(type); + assert(mlirTypeIsAInteger(type) && + "expected integer element type in dense int elements attribute"); + // Dispatch element extraction to an appropriate C function based on the + // elemental type of the attribute. py::int_ is implicitly constructible + // from any C++ integral type and handles bitwidth correctly. + // TODO: consider caching the type properties in the constructor to avoid + // querying them on each element access. 
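+    // Illustrative example: for a dense<[1, 2, 3]> : tensor<3xi32>
+    // attribute, attr[1] takes the signless width-32 branch below and
+    // returns py::int_(2).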
+ unsigned width = mlirIntegerTypeGetWidth(type); + bool isUnsigned = mlirIntegerTypeIsUnsigned(type); + if (isUnsigned) { + if (width == 1) { + return mlirDenseElementsAttrGetBoolValue(*this, pos); + } + if (width == 32) { + return mlirDenseElementsAttrGetUInt32Value(*this, pos); + } + if (width == 64) { + return mlirDenseElementsAttrGetUInt64Value(*this, pos); + } + } else { + if (width == 1) { + return mlirDenseElementsAttrGetBoolValue(*this, pos); + } + if (width == 32) { + return mlirDenseElementsAttrGetInt32Value(*this, pos); + } + if (width == 64) { + return mlirDenseElementsAttrGetInt64Value(*this, pos); + } + } + throw SetPyError(PyExc_TypeError, "Unsupported integer type"); + } + + static void bindDerived(ClassTy &c) { + c.def("__getitem__", &PyDenseIntElementsAttribute::dunderGetItem); + } +}; + +class PyDictAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsADictionary; + static constexpr const char *pyClassName = "DictAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + intptr_t dunderLen() { return mlirDictionaryAttrGetNumElements(*this); } + + static void bindDerived(ClassTy &c) { + c.def("__len__", &PyDictAttribute::dunderLen); + c.def_static( + "get", + [](py::dict attributes, DefaultingPyMlirContext context) { + SmallVector mlirNamedAttributes; + mlirNamedAttributes.reserve(attributes.size()); + for (auto &it : attributes) { + auto &mlir_attr = it.second.cast(); + auto name = it.first.cast(); + mlirNamedAttributes.push_back(mlirNamedAttributeGet( + mlirIdentifierGet(mlirAttributeGetContext(mlir_attr), + toMlirStringRef(name)), + mlir_attr)); + } + MlirAttribute attr = + mlirDictionaryAttrGet(context->get(), mlirNamedAttributes.size(), + mlirNamedAttributes.data()); + return PyDictAttribute(context->getRef(), attr); + }, + py::arg("value"), py::arg("context") = py::none(), + "Gets an uniqued dict attribute"); + c.def("__getitem__", [](PyDictAttribute &self, const std::string &name) { + MlirAttribute attr = + mlirDictionaryAttrGetElementByName(self, toMlirStringRef(name)); + if (mlirAttributeIsNull(attr)) { + throw SetPyError(PyExc_KeyError, + "attempt to access a non-existent attribute"); + } + return PyAttribute(self.getContext(), attr); + }); + c.def("__getitem__", [](PyDictAttribute &self, intptr_t index) { + if (index < 0 || index >= self.dunderLen()) { + throw SetPyError(PyExc_IndexError, + "attempt to access out of bounds attribute"); + } + MlirNamedAttribute namedAttr = mlirDictionaryAttrGetElement(self, index); + return PyNamedAttribute( + namedAttr.attribute, + std::string(mlirIdentifierStr(namedAttr.name).data)); + }); + } +}; + +/// Refinement of PyDenseElementsAttribute for attributes containing +/// floating-point values. Supports element access. +class PyDenseFPElementsAttribute + : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsADenseFPElements; + static constexpr const char *pyClassName = "DenseFPElementsAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + py::float_ dunderGetItem(intptr_t pos) { + if (pos < 0 || pos >= dunderLen()) { + throw SetPyError(PyExc_IndexError, + "attempt to access out of bounds element"); + } + + MlirType type = mlirAttributeGetType(*this); + type = mlirShapedTypeGetElementType(type); + // Dispatch element extraction to an appropriate C function based on the + // elemental type of the attribute. py::float_ is implicitly constructible + // from float and double. 
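+    // Illustrative example: for a dense<[0.5, 1.5]> : tensor<2xf32>
+    // attribute, attr[1] dispatches to mlirDenseElementsAttrGetFloatValue
+    // and returns py::float_(1.5).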
+ // TODO: consider caching the type properties in the constructor to avoid + // querying them on each element access. + if (mlirTypeIsAF32(type)) { + return mlirDenseElementsAttrGetFloatValue(*this, pos); + } + if (mlirTypeIsAF64(type)) { + return mlirDenseElementsAttrGetDoubleValue(*this, pos); + } + throw SetPyError(PyExc_TypeError, "Unsupported floating-point type"); + } + + static void bindDerived(ClassTy &c) { + c.def("__getitem__", &PyDenseFPElementsAttribute::dunderGetItem); + } +}; + +class PyTypeAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAType; + static constexpr const char *pyClassName = "TypeAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyType value, DefaultingPyMlirContext context) { + MlirAttribute attr = mlirTypeAttrGet(value.get()); + return PyTypeAttribute(context->getRef(), attr); + }, + py::arg("value"), py::arg("context") = py::none(), + "Gets a uniqued Type attribute"); + c.def_property_readonly("value", [](PyTypeAttribute &self) { + return PyType(self.getContext()->getRef(), + mlirTypeAttrGetValue(self.get())); + }); + } +}; + +/// Unit Attribute subclass. Unit attributes don't have values. +class PyUnitAttribute : public PyConcreteAttribute { +public: + static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAUnit; + static constexpr const char *pyClassName = "UnitAttr"; + using PyConcreteAttribute::PyConcreteAttribute; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](DefaultingPyMlirContext context) { + return PyUnitAttribute(context->getRef(), + mlirUnitAttrGet(context->get())); + }, + py::arg("context") = py::none(), "Create a Unit attribute."); + } +}; + +} // namespace + +void mlir::python::populateIRAttributes(py::module &m) { + PyAffineMapAttribute::bind(m); + PyArrayAttribute::bind(m); + PyArrayAttribute::PyArrayAttributeIterator::bind(m); + PyBoolAttribute::bind(m); + PyDenseElementsAttribute::bind(m); + PyDenseFPElementsAttribute::bind(m); + PyDenseIntElementsAttribute::bind(m); + PyDictAttribute::bind(m); + PyFlatSymbolRefAttribute::bind(m); + PyFloatAttribute::bind(m); + PyIntegerAttribute::bind(m); + PyStringAttribute::bind(m); + PyTypeAttribute::bind(m); + PyUnitAttribute::bind(m); +} diff --git a/mlir/lib/Bindings/Python/IRModules.cpp b/mlir/lib/Bindings/Python/IRCore.cpp similarity index 52% rename from mlir/lib/Bindings/Python/IRModules.cpp rename to mlir/lib/Bindings/Python/IRCore.cpp index 6b4e5434d1d7..9d87aa52f7c8 100644 --- a/mlir/lib/Bindings/Python/IRModules.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -6,16 +6,14 @@ // //===----------------------------------------------------------------------===// -#include "IRModules.h" +#include "IRModule.h" #include "Globals.h" #include "PybindUtils.h" -#include "mlir-c/AffineMap.h" #include "mlir-c/Bindings/Python/Interop.h" #include "mlir-c/BuiltinAttributes.h" #include "mlir-c/BuiltinTypes.h" -#include "mlir-c/IntegerSet.h" #include "mlir-c/Registration.h" #include "llvm/ADT/SmallVector.h" #include @@ -138,12 +136,6 @@ py::object classmethod(Func f, Args... args) { return py::reinterpret_borrow((PyClassMethod_New(cf.ptr()))); } -/// Checks whether the given type is an integer or float type. 
-static int mlirTypeIsAIntegerOrFloat(MlirType type) { - return mlirTypeIsAInteger(type) || mlirTypeIsABF16(type) || - mlirTypeIsAF16(type) || mlirTypeIsAF32(type) || mlirTypeIsAF64(type); -} - static py::object createCustomDialectWrapper(const std::string &dialectNamespace, py::object dialectDescriptor) { @@ -161,21 +153,6 @@ static MlirStringRef toMlirStringRef(const std::string &s) { return mlirStringRefCreate(s.data(), s.size()); } -template -static bool isPermutation(std::vector permutation) { - llvm::SmallVector seen(permutation.size(), false); - for (auto val : permutation) { - if (val < permutation.size()) { - if (seen[val]) - return false; - seen[val] = true; - continue; - } - return false; - } - return true; -} - //------------------------------------------------------------------------------ // Collections. //------------------------------------------------------------------------------ @@ -1466,7 +1443,8 @@ namespace { /// CRTP base class for Python MLIR values that subclass Value and should be /// castable from it. The value hierarchy is one level deep and is not supposed /// to accommodate other levels unless core MLIR changes. -template class PyConcreteValue : public PyValue { +template +class PyConcreteValue : public PyValue { public: // Derived classes must define statics for: // IsAFunctionTy isaFunction @@ -1717,1910 +1695,169 @@ private: } // end namespace //------------------------------------------------------------------------------ -// Builtin attribute subclasses. +// Populates the core exports of the 'ir' submodule. //------------------------------------------------------------------------------ -namespace { - -/// CRTP base classes for Python attributes that subclass Attribute and should -/// be castable from it (i.e. via something like StringAttr(attr)). -/// By default, attribute class hierarchies are one level deep (i.e. a -/// concrete attribute class extends PyAttribute); however, intermediate -/// python-visible base classes can be modeled by specifying a BaseTy. -template -class PyConcreteAttribute : public BaseTy { -public: - // Derived classes must define statics for: - // IsAFunctionTy isaFunction - // const char *pyClassName - using ClassTy = py::class_; - using IsAFunctionTy = bool (*)(MlirAttribute); - - PyConcreteAttribute() = default; - PyConcreteAttribute(PyMlirContextRef contextRef, MlirAttribute attr) - : BaseTy(std::move(contextRef), attr) {} - PyConcreteAttribute(PyAttribute &orig) - : PyConcreteAttribute(orig.getContext(), castFrom(orig)) {} - - static MlirAttribute castFrom(PyAttribute &orig) { - if (!DerivedTy::isaFunction(orig)) { - auto origRepr = py::repr(py::cast(orig)).cast(); - throw SetPyError(PyExc_ValueError, Twine("Cannot cast attribute to ") + - DerivedTy::pyClassName + - " (from " + origRepr + ")"); - } - return orig; - } - - static void bind(py::module &m) { - auto cls = ClassTy(m, DerivedTy::pyClassName, py::buffer_protocol()); - cls.def(py::init(), py::keep_alive<0, 1>()); - DerivedTy::bindDerived(cls); - } - - /// Implemented by derived classes to add methods to the Python subclass. 
- static void bindDerived(ClassTy &m) {} -}; - -class PyAffineMapAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAAffineMap; - static constexpr const char *pyClassName = "AffineMapAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](PyAffineMap &affineMap) { - MlirAttribute attr = mlirAffineMapAttrGet(affineMap.get()); - return PyAffineMapAttribute(affineMap.getContext(), attr); - }, - py::arg("affine_map"), "Gets an attribute wrapping an AffineMap."); - } -}; - -class PyArrayAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAArray; - static constexpr const char *pyClassName = "ArrayAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - class PyArrayAttributeIterator { - public: - PyArrayAttributeIterator(PyAttribute attr) : attr(attr) {} - - PyArrayAttributeIterator &dunderIter() { return *this; } - - PyAttribute dunderNext() { - if (nextIndex >= mlirArrayAttrGetNumElements(attr.get())) { - throw py::stop_iteration(); - } - return PyAttribute(attr.getContext(), - mlirArrayAttrGetElement(attr.get(), nextIndex++)); - } - - static void bind(py::module &m) { - py::class_(m, "ArrayAttributeIterator") - .def("__iter__", &PyArrayAttributeIterator::dunderIter) - .def("__next__", &PyArrayAttributeIterator::dunderNext); - } - - private: - PyAttribute attr; - int nextIndex = 0; - }; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](py::list attributes, DefaultingPyMlirContext context) { - SmallVector mlirAttributes; - mlirAttributes.reserve(py::len(attributes)); - for (auto attribute : attributes) { - try { - mlirAttributes.push_back(attribute.cast()); - } catch (py::cast_error &err) { - std::string msg = std::string("Invalid attribute when attempting " - "to create an ArrayAttribute (") + - err.what() + ")"; - throw py::cast_error(msg); - } catch (py::reference_cast_error &err) { - // This exception seems thrown when the value is "None". - std::string msg = - std::string("Invalid attribute (None?) 
when attempting to " - "create an ArrayAttribute (") + - err.what() + ")"; - throw py::cast_error(msg); +void mlir::python::populateIRCore(py::module &m) { + //---------------------------------------------------------------------------- + // Mapping of MlirContext + //---------------------------------------------------------------------------- + py::class_(m, "Context") + .def(py::init<>(&PyMlirContext::createNewContextForInit)) + .def_static("_get_live_count", &PyMlirContext::getLiveCount) + .def("_get_context_again", + [](PyMlirContext &self) { + PyMlirContextRef ref = PyMlirContext::forContext(self.get()); + return ref.releaseObject(); + }) + .def("_get_live_operation_count", &PyMlirContext::getLiveOperationCount) + .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount) + .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, + &PyMlirContext::getCapsule) + .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyMlirContext::createFromCapsule) + .def("__enter__", &PyMlirContext::contextEnter) + .def("__exit__", &PyMlirContext::contextExit) + .def_property_readonly_static( + "current", + [](py::object & /*class*/) { + auto *context = PyThreadContextEntry::getDefaultContext(); + if (!context) + throw SetPyError(PyExc_ValueError, "No current Context"); + return context; + }, + "Gets the Context bound to the current thread or raises ValueError") + .def_property_readonly( + "dialects", + [](PyMlirContext &self) { return PyDialects(self.getRef()); }, + "Gets a container for accessing dialects by name") + .def_property_readonly( + "d", [](PyMlirContext &self) { return PyDialects(self.getRef()); }, + "Alias for 'dialect'") + .def( + "get_dialect_descriptor", + [=](PyMlirContext &self, std::string &name) { + MlirDialect dialect = mlirContextGetOrLoadDialect( + self.get(), {name.data(), name.size()}); + if (mlirDialectIsNull(dialect)) { + throw SetPyError(PyExc_ValueError, + Twine("Dialect '") + name + "' not found"); } - } - MlirAttribute attr = mlirArrayAttrGet( - context->get(), mlirAttributes.size(), mlirAttributes.data()); - return PyArrayAttribute(context->getRef(), attr); - }, - py::arg("attributes"), py::arg("context") = py::none(), - "Gets a uniqued Array attribute"); - c.def("__getitem__", - [](PyArrayAttribute &arr, intptr_t i) { - if (i >= mlirArrayAttrGetNumElements(arr)) - throw py::index_error("ArrayAttribute index out of range"); - return PyAttribute(arr.getContext(), - mlirArrayAttrGetElement(arr, i)); - }) - .def("__len__", - [](const PyArrayAttribute &arr) { - return mlirArrayAttrGetNumElements(arr); - }) - .def("__iter__", [](const PyArrayAttribute &arr) { - return PyArrayAttributeIterator(arr); - }); - } -}; - -/// Float Point Attribute subclass - FloatAttr. -class PyFloatAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAFloat; - static constexpr const char *pyClassName = "FloatAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](PyType &type, double value, DefaultingPyLocation loc) { - MlirAttribute attr = mlirFloatAttrDoubleGetChecked(loc, type, value); - // TODO: Rework error reporting once diagnostic engine is exposed - // in C API. 
- if (mlirAttributeIsNull(attr)) { - throw SetPyError(PyExc_ValueError, - Twine("invalid '") + - py::repr(py::cast(type)).cast() + - "' and expected floating point type."); - } - return PyFloatAttribute(type.getContext(), attr); - }, - py::arg("type"), py::arg("value"), py::arg("loc") = py::none(), - "Gets an uniqued float point attribute associated to a type"); - c.def_static( - "get_f32", - [](double value, DefaultingPyMlirContext context) { - MlirAttribute attr = mlirFloatAttrDoubleGet( - context->get(), mlirF32TypeGet(context->get()), value); - return PyFloatAttribute(context->getRef(), attr); - }, - py::arg("value"), py::arg("context") = py::none(), - "Gets an uniqued float point attribute associated to a f32 type"); - c.def_static( - "get_f64", - [](double value, DefaultingPyMlirContext context) { - MlirAttribute attr = mlirFloatAttrDoubleGet( - context->get(), mlirF64TypeGet(context->get()), value); - return PyFloatAttribute(context->getRef(), attr); - }, - py::arg("value"), py::arg("context") = py::none(), - "Gets an uniqued float point attribute associated to a f64 type"); - c.def_property_readonly( - "value", - [](PyFloatAttribute &self) { - return mlirFloatAttrGetValueDouble(self); - }, - "Returns the value of the float point attribute"); - } -}; - -/// Integer Attribute subclass - IntegerAttr. -class PyIntegerAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAInteger; - static constexpr const char *pyClassName = "IntegerAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](PyType &type, int64_t value) { - MlirAttribute attr = mlirIntegerAttrGet(type, value); - return PyIntegerAttribute(type.getContext(), attr); - }, - py::arg("type"), py::arg("value"), - "Gets an uniqued integer attribute associated to a type"); - c.def_property_readonly( - "value", - [](PyIntegerAttribute &self) { - return mlirIntegerAttrGetValueInt(self); - }, - "Returns the value of the integer attribute"); - } -}; - -/// Bool Attribute subclass - BoolAttr. 
-class PyBoolAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsABool; - static constexpr const char *pyClassName = "BoolAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](bool value, DefaultingPyMlirContext context) { - MlirAttribute attr = mlirBoolAttrGet(context->get(), value); - return PyBoolAttribute(context->getRef(), attr); - }, - py::arg("value"), py::arg("context") = py::none(), - "Gets an uniqued bool attribute"); - c.def_property_readonly( - "value", - [](PyBoolAttribute &self) { return mlirBoolAttrGetValue(self); }, - "Returns the value of the bool attribute"); - } -}; - -class PyFlatSymbolRefAttribute - : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAFlatSymbolRef; - static constexpr const char *pyClassName = "FlatSymbolRefAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](std::string value, DefaultingPyMlirContext context) { - MlirAttribute attr = - mlirFlatSymbolRefAttrGet(context->get(), toMlirStringRef(value)); - return PyFlatSymbolRefAttribute(context->getRef(), attr); - }, - py::arg("value"), py::arg("context") = py::none(), - "Gets a uniqued FlatSymbolRef attribute"); - c.def_property_readonly( - "value", - [](PyFlatSymbolRefAttribute &self) { - MlirStringRef stringRef = mlirFlatSymbolRefAttrGetValue(self); - return py::str(stringRef.data, stringRef.length); - }, - "Returns the value of the FlatSymbolRef attribute as a string"); - } -}; - -class PyStringAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAString; - static constexpr const char *pyClassName = "StringAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](std::string value, DefaultingPyMlirContext context) { - MlirAttribute attr = - mlirStringAttrGet(context->get(), toMlirStringRef(value)); - return PyStringAttribute(context->getRef(), attr); - }, - py::arg("value"), py::arg("context") = py::none(), - "Gets a uniqued string attribute"); - c.def_static( - "get_typed", - [](PyType &type, std::string value) { - MlirAttribute attr = - mlirStringAttrTypedGet(type, toMlirStringRef(value)); - return PyStringAttribute(type.getContext(), attr); - }, - - "Gets a uniqued string attribute associated to a type"); - c.def_property_readonly( - "value", - [](PyStringAttribute &self) { - MlirStringRef stringRef = mlirStringAttrGetValue(self); - return py::str(stringRef.data, stringRef.length); - }, - "Returns the value of the string attribute"); - } -}; - -// TODO: Support construction of bool elements. -// TODO: Support construction of string elements. -class PyDenseElementsAttribute - : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsADenseElements; - static constexpr const char *pyClassName = "DenseElementsAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static PyDenseElementsAttribute - getFromBuffer(py::buffer array, bool signless, - DefaultingPyMlirContext contextWrapper) { - // Request a contiguous view. In exotic cases, this will cause a copy. 
- int flags = PyBUF_C_CONTIGUOUS | PyBUF_FORMAT; - Py_buffer *view = new Py_buffer(); - if (PyObject_GetBuffer(array.ptr(), view, flags) != 0) { - delete view; - throw py::error_already_set(); - } - py::buffer_info arrayInfo(view); - - MlirContext context = contextWrapper->get(); - // Switch on the types that can be bulk loaded between the Python and - // MLIR-C APIs. - // See: https://docs.python.org/3/library/struct.html#format-characters - if (arrayInfo.format == "f") { - // f32 - assert(arrayInfo.itemsize == 4 && "mismatched array itemsize"); - return PyDenseElementsAttribute( - contextWrapper->getRef(), - bulkLoad(context, mlirDenseElementsAttrFloatGet, - mlirF32TypeGet(context), arrayInfo)); - } else if (arrayInfo.format == "d") { - // f64 - assert(arrayInfo.itemsize == 8 && "mismatched array itemsize"); - return PyDenseElementsAttribute( - contextWrapper->getRef(), - bulkLoad(context, mlirDenseElementsAttrDoubleGet, - mlirF64TypeGet(context), arrayInfo)); - } else if (isSignedIntegerFormat(arrayInfo.format)) { - if (arrayInfo.itemsize == 4) { - // i32 - MlirType elementType = signless ? mlirIntegerTypeGet(context, 32) - : mlirIntegerTypeSignedGet(context, 32); - return PyDenseElementsAttribute(contextWrapper->getRef(), - bulkLoad(context, - mlirDenseElementsAttrInt32Get, - elementType, arrayInfo)); - } else if (arrayInfo.itemsize == 8) { - // i64 - MlirType elementType = signless ? mlirIntegerTypeGet(context, 64) - : mlirIntegerTypeSignedGet(context, 64); - return PyDenseElementsAttribute(contextWrapper->getRef(), - bulkLoad(context, - mlirDenseElementsAttrInt64Get, - elementType, arrayInfo)); - } - } else if (isUnsignedIntegerFormat(arrayInfo.format)) { - if (arrayInfo.itemsize == 4) { - // unsigned i32 - MlirType elementType = signless - ? mlirIntegerTypeGet(context, 32) - : mlirIntegerTypeUnsignedGet(context, 32); - return PyDenseElementsAttribute(contextWrapper->getRef(), - bulkLoad(context, - mlirDenseElementsAttrUInt32Get, - elementType, arrayInfo)); - } else if (arrayInfo.itemsize == 8) { - // unsigned i64 - MlirType elementType = signless - ? mlirIntegerTypeGet(context, 64) - : mlirIntegerTypeUnsignedGet(context, 64); - return PyDenseElementsAttribute(contextWrapper->getRef(), - bulkLoad(context, - mlirDenseElementsAttrUInt64Get, - elementType, arrayInfo)); - } - } - - // TODO: Fall back to string-based get. 
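
The format-code dispatch above is what lets `DenseElementsAttr.get` accept
NumPy arrays directly: 'f' and 'd' map to f32/f64 tensors, and 4- or 8-byte
integer formats map to 32/64-bit integer element types, signless by default.
A sketch, assuming NumPy is installed:

import numpy as np
from mlir.ir import Context, DenseElementsAttr

with Context():
    f32_attr = DenseElementsAttr.get(np.ones((2, 3), dtype=np.float32))
    # signless=False keeps explicit si32 signedness instead of plain i32.
    si32_attr = DenseElementsAttr.get(
        np.array([1, 2, 3], dtype=np.int32), signless=False)
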
- std::string message = "unimplemented array format conversion from format: "; - message.append(arrayInfo.format); - throw SetPyError(PyExc_ValueError, message); - } - - static PyDenseElementsAttribute getSplat(PyType shapedType, - PyAttribute &elementAttr) { - auto contextWrapper = - PyMlirContext::forContext(mlirTypeGetContext(shapedType)); - if (!mlirAttributeIsAInteger(elementAttr) && - !mlirAttributeIsAFloat(elementAttr)) { - std::string message = "Illegal element type for DenseElementsAttr: "; - message.append(py::repr(py::cast(elementAttr))); - throw SetPyError(PyExc_ValueError, message); - } - if (!mlirTypeIsAShaped(shapedType) || - !mlirShapedTypeHasStaticShape(shapedType)) { - std::string message = - "Expected a static ShapedType for the shaped_type parameter: "; - message.append(py::repr(py::cast(shapedType))); - throw SetPyError(PyExc_ValueError, message); - } - MlirType shapedElementType = mlirShapedTypeGetElementType(shapedType); - MlirType attrType = mlirAttributeGetType(elementAttr); - if (!mlirTypeEqual(shapedElementType, attrType)) { - std::string message = - "Shaped element type and attribute type must be equal: shaped="; - message.append(py::repr(py::cast(shapedType))); - message.append(", element="); - message.append(py::repr(py::cast(elementAttr))); - throw SetPyError(PyExc_ValueError, message); - } - - MlirAttribute elements = - mlirDenseElementsAttrSplatGet(shapedType, elementAttr); - return PyDenseElementsAttribute(contextWrapper->getRef(), elements); - } + return PyDialectDescriptor(self.getRef(), dialect); + }, + "Gets or loads a dialect by name, returning its descriptor object") + .def_property( + "allow_unregistered_dialects", + [](PyMlirContext &self) -> bool { + return mlirContextGetAllowUnregisteredDialects(self.get()); + }, + [](PyMlirContext &self, bool value) { + mlirContextSetAllowUnregisteredDialects(self.get(), value); + }); - intptr_t dunderLen() { return mlirElementsAttrGetNumElements(*this); } - - py::buffer_info accessBuffer() { - MlirType shapedType = mlirAttributeGetType(*this); - MlirType elementType = mlirShapedTypeGetElementType(shapedType); - - if (mlirTypeIsAF32(elementType)) { - // f32 - return bufferInfo(shapedType, mlirDenseElementsAttrGetFloatValue); - } else if (mlirTypeIsAF64(elementType)) { - // f64 - return bufferInfo(shapedType, mlirDenseElementsAttrGetDoubleValue); - } else if (mlirTypeIsAInteger(elementType) && - mlirIntegerTypeGetWidth(elementType) == 32) { - if (mlirIntegerTypeIsSignless(elementType) || - mlirIntegerTypeIsSigned(elementType)) { - // i32 - return bufferInfo(shapedType, mlirDenseElementsAttrGetInt32Value); - } else if (mlirIntegerTypeIsUnsigned(elementType)) { - // unsigned i32 - return bufferInfo(shapedType, mlirDenseElementsAttrGetUInt32Value); - } - } else if (mlirTypeIsAInteger(elementType) && - mlirIntegerTypeGetWidth(elementType) == 64) { - if (mlirIntegerTypeIsSignless(elementType) || - mlirIntegerTypeIsSigned(elementType)) { - // i64 - return bufferInfo(shapedType, mlirDenseElementsAttrGetInt64Value); - } else if (mlirIntegerTypeIsUnsigned(elementType)) { - // unsigned i64 - return bufferInfo(shapedType, mlirDenseElementsAttrGetUInt64Value); - } - } + //---------------------------------------------------------------------------- + // Mapping of PyDialectDescriptor + //---------------------------------------------------------------------------- + py::class_(m, "DialectDescriptor") + .def_property_readonly("namespace", + [](PyDialectDescriptor &self) { + MlirStringRef ns = + 
                                   mlirDialectGetNamespace(self.get());
+                               return py::str(ns.data, ns.length);
+                             })
+      .def("__repr__", [](PyDialectDescriptor &self) {
+        MlirStringRef ns = mlirDialectGetNamespace(self.get());
+        std::string repr("<DialectDescriptor ");
+        repr.append(ns.data, ns.length);
+        repr.append(">");
+        return repr;
+      });

-    std::string message = "unimplemented array format.";
-    throw SetPyError(PyExc_ValueError, message);
-  }

+  //----------------------------------------------------------------------------
+  // Mapping of PyDialects
+  //----------------------------------------------------------------------------
+  py::class_<PyDialects>(m, "Dialects")
+      .def("__getitem__",
+           [=](PyDialects &self, std::string keyName) {
+             MlirDialect dialect =
+                 self.getDialectForKey(keyName, /*attrError=*/false);
+             py::object descriptor =
+                 py::cast(PyDialectDescriptor{self.getContext(), dialect});
+             return createCustomDialectWrapper(keyName, std::move(descriptor));
+           })
+      .def("__getattr__", [=](PyDialects &self, std::string attrName) {
+        MlirDialect dialect =
+            self.getDialectForKey(attrName, /*attrError=*/true);
+        py::object descriptor =
+            py::cast(PyDialectDescriptor{self.getContext(), dialect});
+        return createCustomDialectWrapper(attrName, std::move(descriptor));
+      });

-  static void bindDerived(ClassTy &c) {
-    c.def("__len__", &PyDenseElementsAttribute::dunderLen)
-        .def_static("get", PyDenseElementsAttribute::getFromBuffer,
-                    py::arg("array"), py::arg("signless") = true,
-                    py::arg("context") = py::none(),
-                    "Gets from a buffer or ndarray")
-        .def_static("get_splat", PyDenseElementsAttribute::getSplat,
-                    py::arg("shaped_type"), py::arg("element_attr"),
-                    "Gets a DenseElementsAttr where all values are the same")
-        .def_property_readonly("is_splat",
-                               [](PyDenseElementsAttribute &self) -> bool {
-                                 return mlirDenseElementsAttrIsSplat(self);
-                               })
-        .def_buffer(&PyDenseElementsAttribute::accessBuffer);
-  }

+  //----------------------------------------------------------------------------
+  // Mapping of PyDialect
+  //----------------------------------------------------------------------------
+  py::class_<PyDialect>(m, "Dialect")
+      .def(py::init<py::object>(), "descriptor")
+      .def_property_readonly(
+          "descriptor", [](PyDialect &self) { return self.getDescriptor(); })
+      .def("__repr__", [](py::object self) {
+        auto clazz = self.attr("__class__");
+        return py::str("<Dialect ") +
+               self.attr("descriptor").attr("namespace") +
+               py::str(" (class ") + clazz.attr("__module__") +
+               py::str(".") + clazz.attr("__name__") + py::str(")>");
+      });

-private:
-  template <typename ElementTy>
-  static MlirAttribute
-  bulkLoad(MlirContext context,
-           MlirAttribute (*ctor)(MlirType, intptr_t, ElementTy *),
-           MlirType mlirElementType, py::buffer_info &arrayInfo) {
-    SmallVector<int64_t, 4> shape(arrayInfo.shape.begin(),
-                                  arrayInfo.shape.begin() + arrayInfo.ndim);
-    auto shapedType =
-        mlirRankedTensorTypeGet(shape.size(), shape.data(), mlirElementType);
-    intptr_t numElements = arrayInfo.size;
-    const ElementTy *contents = static_cast<const ElementTy *>(arrayInfo.ptr);
-    return ctor(shapedType, numElements, contents);
-  }
-
-  static bool isUnsignedIntegerFormat(const std::string &format) {
-    if (format.empty())
-      return false;
-    char code = format[0];
-    return code == 'I' || code == 'B' || code == 'H' || code == 'L' ||
-           code == 'Q';
-  }
-
-  static bool isSignedIntegerFormat(const std::string &format) {
-    if (format.empty())
-      return false;
-    char code = format[0];
-    return code == 'i' || code == 'b' || code == 'h' || code == 'l' ||
-           code == 'q';
-  }
-
-  template <typename Type>
-  py::buffer_info bufferInfo(MlirType shapedType,
-                             Type (*value)(MlirAttribute, intptr_t)) {
-    intptr_t rank = mlirShapedTypeGetRank(shapedType);
-    // Prepare the data for the buffer_info.
-    // Buffer is configured for read-only access below.
-    Type *data = static_cast<Type *>(
-        const_cast<void *>(mlirDenseElementsAttrGetRawData(*this)));
-    // Prepare the shape for the buffer_info.
-    SmallVector<intptr_t, 4> shape;
-    for (intptr_t i = 0; i < rank; ++i)
-      shape.push_back(mlirShapedTypeGetDimSize(shapedType, i));
-    // Prepare the strides for the buffer_info.
-    SmallVector<intptr_t, 4> strides;
-    intptr_t strideFactor = 1;
-    for (intptr_t i = 1; i < rank; ++i) {
-      strideFactor = 1;
-      for (intptr_t j = i; j < rank; ++j) {
-        strideFactor *= mlirShapedTypeGetDimSize(shapedType, j);
-      }
-      strides.push_back(sizeof(Type) * strideFactor);
-    }
-    strides.push_back(sizeof(Type));
-    return py::buffer_info(data, sizeof(Type),
-                           py::format_descriptor<Type>::format(), rank, shape,
-                           strides, /*readonly=*/true);
-  }
-};  // namespace
-
-/// Refinement of the PyDenseElementsAttribute for attributes containing integer
-/// (and boolean) values. Supports element access.
-class PyDenseIntElementsAttribute
-    : public PyConcreteAttribute<PyDenseIntElementsAttribute,
-                                 PyDenseElementsAttribute> {
-public:
-  static constexpr IsAFunctionTy isaFunction = mlirAttributeIsADenseIntElements;
-  static constexpr const char *pyClassName = "DenseIntElementsAttr";
-  using PyConcreteAttribute::PyConcreteAttribute;
-
-  /// Returns the element at the given linear position. Asserts if the index is
-  /// out of range.
-  py::int_ dunderGetItem(intptr_t pos) {
-    if (pos < 0 || pos >= dunderLen()) {
-      throw SetPyError(PyExc_IndexError,
-                       "attempt to access out of bounds element");
-    }
-
-    MlirType type = mlirAttributeGetType(*this);
-    type = mlirShapedTypeGetElementType(type);
-    assert(mlirTypeIsAInteger(type) &&
-           "expected integer element type in dense int elements attribute");
-    // Dispatch element extraction to an appropriate C function based on the
-    // elemental type of the attribute. py::int_ is implicitly constructible
-    // from any C++ integral type and handles bitwidth correctly.
-    // TODO: consider caching the type properties in the constructor to avoid
-    // querying them on each element access.
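
Taken together, `get_splat`, `__len__`, and the read-only buffer export above
give round-tripping between NumPy and dense attributes, and the nested loop
computes ordinary row-major strides in bytes. A sketch of both, assuming
NumPy is available; not part of the patch:

import numpy as np
from mlir.ir import (Context, DenseElementsAttr, F32Type, FloatAttr,
                     Location, RankedTensorType)

with Context(), Location.unknown():
    ty = RankedTensorType.get([2, 2], F32Type.get())
    splat = DenseElementsAttr.get_splat(ty, FloatAttr.get_f32(1.5))
    assert splat.is_splat and len(splat) == 4
    dense = DenseElementsAttr.get(np.arange(4, dtype=np.float32).reshape(2, 2))
    readback = np.array(dense)  # exported through bufferInfo(), read-only copy

# The same stride rule as the C++ above, in plain Python:
def row_major_strides(shape, itemsize):
    strides = [itemsize]
    for dim in reversed(shape[1:]):
        strides.insert(0, strides[0] * dim)
    return strides

assert row_major_strides([2, 3, 4], itemsize=4) == [48, 16, 4]
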
- unsigned width = mlirIntegerTypeGetWidth(type); - bool isUnsigned = mlirIntegerTypeIsUnsigned(type); - if (isUnsigned) { - if (width == 1) { - return mlirDenseElementsAttrGetBoolValue(*this, pos); - } - if (width == 32) { - return mlirDenseElementsAttrGetUInt32Value(*this, pos); - } - if (width == 64) { - return mlirDenseElementsAttrGetUInt64Value(*this, pos); - } - } else { - if (width == 1) { - return mlirDenseElementsAttrGetBoolValue(*this, pos); - } - if (width == 32) { - return mlirDenseElementsAttrGetInt32Value(*this, pos); - } - if (width == 64) { - return mlirDenseElementsAttrGetInt64Value(*this, pos); - } - } - throw SetPyError(PyExc_TypeError, "Unsupported integer type"); - } - - static void bindDerived(ClassTy &c) { - c.def("__getitem__", &PyDenseIntElementsAttribute::dunderGetItem); - } -}; - -class PyDictAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsADictionary; - static constexpr const char *pyClassName = "DictAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - intptr_t dunderLen() { return mlirDictionaryAttrGetNumElements(*this); } - - static void bindDerived(ClassTy &c) { - c.def("__len__", &PyDictAttribute::dunderLen); - c.def_static( - "get", - [](py::dict attributes, DefaultingPyMlirContext context) { - SmallVector mlirNamedAttributes; - mlirNamedAttributes.reserve(attributes.size()); - for (auto &it : attributes) { - auto &mlir_attr = it.second.cast(); - auto name = it.first.cast(); - mlirNamedAttributes.push_back(mlirNamedAttributeGet( - mlirIdentifierGet(mlirAttributeGetContext(mlir_attr), - toMlirStringRef(name)), - mlir_attr)); - } - MlirAttribute attr = - mlirDictionaryAttrGet(context->get(), mlirNamedAttributes.size(), - mlirNamedAttributes.data()); - return PyDictAttribute(context->getRef(), attr); - }, - py::arg("value"), py::arg("context") = py::none(), - "Gets an uniqued dict attribute"); - c.def("__getitem__", [](PyDictAttribute &self, const std::string &name) { - MlirAttribute attr = - mlirDictionaryAttrGetElementByName(self, toMlirStringRef(name)); - if (mlirAttributeIsNull(attr)) { - throw SetPyError(PyExc_KeyError, - "attempt to access a non-existent attribute"); - } - return PyAttribute(self.getContext(), attr); - }); - c.def("__getitem__", [](PyDictAttribute &self, intptr_t index) { - if (index < 0 || index >= self.dunderLen()) { - throw SetPyError(PyExc_IndexError, - "attempt to access out of bounds attribute"); - } - MlirNamedAttribute namedAttr = mlirDictionaryAttrGetElement(self, index); - return PyNamedAttribute( - namedAttr.attribute, - std::string(mlirIdentifierStr(namedAttr.name).data)); - }); - } -}; - -/// Refinement of PyDenseElementsAttribute for attributes containing -/// floating-point values. Supports element access. -class PyDenseFPElementsAttribute - : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsADenseFPElements; - static constexpr const char *pyClassName = "DenseFPElementsAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - py::float_ dunderGetItem(intptr_t pos) { - if (pos < 0 || pos >= dunderLen()) { - throw SetPyError(PyExc_IndexError, - "attempt to access out of bounds element"); - } - - MlirType type = mlirAttributeGetType(*this); - type = mlirShapedTypeGetElementType(type); - // Dispatch element extraction to an appropriate C function based on the - // elemental type of the attribute. py::float_ is implicitly constructible - // from float and double. 
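
Element access and the dictionary attribute bound above, as they look from
the Python side; a usage sketch under the same import assumptions as before:

from mlir.ir import Context, DictAttr, IntegerAttr, IntegerType

with Context():
    i32 = IntegerType.get_signless(32)
    d = DictAttr.get({"alignment": IntegerAttr.get(i32, 16)})
    assert len(d) == 1
    attr = d["alignment"]  # keyed lookup returns the Attribute; KeyError if absent
    named = d[0]           # integer indexing returns a NamedAttribute
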
- // TODO: consider caching the type properties in the constructor to avoid - // querying them on each element access. - if (mlirTypeIsAF32(type)) { - return mlirDenseElementsAttrGetFloatValue(*this, pos); - } - if (mlirTypeIsAF64(type)) { - return mlirDenseElementsAttrGetDoubleValue(*this, pos); - } - throw SetPyError(PyExc_TypeError, "Unsupported floating-point type"); - } - - static void bindDerived(ClassTy &c) { - c.def("__getitem__", &PyDenseFPElementsAttribute::dunderGetItem); - } -}; - -class PyTypeAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAType; - static constexpr const char *pyClassName = "TypeAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](PyType value, DefaultingPyMlirContext context) { - MlirAttribute attr = mlirTypeAttrGet(value.get()); - return PyTypeAttribute(context->getRef(), attr); - }, - py::arg("value"), py::arg("context") = py::none(), - "Gets a uniqued Type attribute"); - c.def_property_readonly("value", [](PyTypeAttribute &self) { - return PyType(self.getContext()->getRef(), - mlirTypeAttrGetValue(self.get())); - }); - } -}; - -/// Unit Attribute subclass. Unit attributes don't have values. -class PyUnitAttribute : public PyConcreteAttribute { -public: - static constexpr IsAFunctionTy isaFunction = mlirAttributeIsAUnit; - static constexpr const char *pyClassName = "UnitAttr"; - using PyConcreteAttribute::PyConcreteAttribute; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](DefaultingPyMlirContext context) { - return PyUnitAttribute(context->getRef(), - mlirUnitAttrGet(context->get())); - }, - py::arg("context") = py::none(), "Create a Unit attribute."); - } -}; - -} // namespace - -//------------------------------------------------------------------------------ -// Builtin type subclasses. -//------------------------------------------------------------------------------ - -namespace { - -/// CRTP base classes for Python types that subclass Type and should be -/// castable from it (i.e. via something like IntegerType(t)). -/// By default, type class hierarchies are one level deep (i.e. a -/// concrete type class extends PyType); however, intermediate python-visible -/// base classes can be modeled by specifying a BaseTy. -template -class PyConcreteType : public BaseTy { -public: - // Derived classes must define statics for: - // IsAFunctionTy isaFunction - // const char *pyClassName - using ClassTy = py::class_; - using IsAFunctionTy = bool (*)(MlirType); - - PyConcreteType() = default; - PyConcreteType(PyMlirContextRef contextRef, MlirType t) - : BaseTy(std::move(contextRef), t) {} - PyConcreteType(PyType &orig) - : PyConcreteType(orig.getContext(), castFrom(orig)) {} - - static MlirType castFrom(PyType &orig) { - if (!DerivedTy::isaFunction(orig)) { - auto origRepr = py::repr(py::cast(orig)).cast(); - throw SetPyError(PyExc_ValueError, Twine("Cannot cast type to ") + - DerivedTy::pyClassName + - " (from " + origRepr + ")"); - } - return orig; - } - - static void bind(py::module &m) { - auto cls = ClassTy(m, DerivedTy::pyClassName); - cls.def(py::init(), py::keep_alive<0, 1>()); - cls.def_static("isinstance", [](PyType &otherType) -> bool { - return DerivedTy::isaFunction(otherType); - }); - DerivedTy::bindDerived(cls); - } - - /// Implemented by derived classes to add methods to the Python subclass. 
- static void bindDerived(ClassTy &m) {} -}; - -class PyIntegerType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAInteger; - static constexpr const char *pyClassName = "IntegerType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get_signless", - [](unsigned width, DefaultingPyMlirContext context) { - MlirType t = mlirIntegerTypeGet(context->get(), width); - return PyIntegerType(context->getRef(), t); - }, - py::arg("width"), py::arg("context") = py::none(), - "Create a signless integer type"); - c.def_static( - "get_signed", - [](unsigned width, DefaultingPyMlirContext context) { - MlirType t = mlirIntegerTypeSignedGet(context->get(), width); - return PyIntegerType(context->getRef(), t); - }, - py::arg("width"), py::arg("context") = py::none(), - "Create a signed integer type"); - c.def_static( - "get_unsigned", - [](unsigned width, DefaultingPyMlirContext context) { - MlirType t = mlirIntegerTypeUnsignedGet(context->get(), width); - return PyIntegerType(context->getRef(), t); - }, - py::arg("width"), py::arg("context") = py::none(), - "Create an unsigned integer type"); - c.def_property_readonly( - "width", - [](PyIntegerType &self) { return mlirIntegerTypeGetWidth(self); }, - "Returns the width of the integer type"); - c.def_property_readonly( - "is_signless", - [](PyIntegerType &self) -> bool { - return mlirIntegerTypeIsSignless(self); - }, - "Returns whether this is a signless integer"); - c.def_property_readonly( - "is_signed", - [](PyIntegerType &self) -> bool { - return mlirIntegerTypeIsSigned(self); - }, - "Returns whether this is a signed integer"); - c.def_property_readonly( - "is_unsigned", - [](PyIntegerType &self) -> bool { - return mlirIntegerTypeIsUnsigned(self); - }, - "Returns whether this is an unsigned integer"); - } -}; - -/// Index Type subclass - IndexType. -class PyIndexType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAIndex; - static constexpr const char *pyClassName = "IndexType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](DefaultingPyMlirContext context) { - MlirType t = mlirIndexTypeGet(context->get()); - return PyIndexType(context->getRef(), t); - }, - py::arg("context") = py::none(), "Create a index type."); - } -}; - -/// Floating Point Type subclass - BF16Type. -class PyBF16Type : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsABF16; - static constexpr const char *pyClassName = "BF16Type"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](DefaultingPyMlirContext context) { - MlirType t = mlirBF16TypeGet(context->get()); - return PyBF16Type(context->getRef(), t); - }, - py::arg("context") = py::none(), "Create a bf16 type."); - } -}; - -/// Floating Point Type subclass - F16Type. -class PyF16Type : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAF16; - static constexpr const char *pyClassName = "F16Type"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](DefaultingPyMlirContext context) { - MlirType t = mlirF16TypeGet(context->get()); - return PyF16Type(context->getRef(), t); - }, - py::arg("context") = py::none(), "Create a f16 type."); - } -}; - -/// Floating Point Type subclass - F32Type. 
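
The CRTP machinery above is what makes the concrete type classes behave like
checked casts: constructing one from a generic `Type` raises `ValueError`
when the isa-check fails, and `isinstance` exposes the same check without
throwing. A sketch using `IntegerType` from above and `F32Type`, bound just
below:

from mlir.ir import Context, F32Type, IntegerType

with Context():
    t = IntegerType.get_signless(32)
    assert IntegerType.isinstance(t) and t.is_signless and t.width == 32
    try:
        F32Type(t)  # wrong kind of type: castFrom() raises ValueError
    except ValueError:
        pass
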
-class PyF32Type : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAF32; - static constexpr const char *pyClassName = "F32Type"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](DefaultingPyMlirContext context) { - MlirType t = mlirF32TypeGet(context->get()); - return PyF32Type(context->getRef(), t); - }, - py::arg("context") = py::none(), "Create a f32 type."); - } -}; - -/// Floating Point Type subclass - F64Type. -class PyF64Type : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAF64; - static constexpr const char *pyClassName = "F64Type"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](DefaultingPyMlirContext context) { - MlirType t = mlirF64TypeGet(context->get()); - return PyF64Type(context->getRef(), t); - }, - py::arg("context") = py::none(), "Create a f64 type."); - } -}; - -/// None Type subclass - NoneType. -class PyNoneType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsANone; - static constexpr const char *pyClassName = "NoneType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](DefaultingPyMlirContext context) { - MlirType t = mlirNoneTypeGet(context->get()); - return PyNoneType(context->getRef(), t); - }, - py::arg("context") = py::none(), "Create a none type."); - } -}; - -/// Complex Type subclass - ComplexType. -class PyComplexType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAComplex; - static constexpr const char *pyClassName = "ComplexType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](PyType &elementType) { - // The element must be a floating point or integer scalar type. 
- if (mlirTypeIsAIntegerOrFloat(elementType)) { - MlirType t = mlirComplexTypeGet(elementType); - return PyComplexType(elementType.getContext(), t); - } - throw SetPyError( - PyExc_ValueError, - Twine("invalid '") + - py::repr(py::cast(elementType)).cast() + - "' and expected floating point or integer type."); - }, - "Create a complex type"); - c.def_property_readonly( - "element_type", - [](PyComplexType &self) -> PyType { - MlirType t = mlirComplexTypeGetElementType(self); - return PyType(self.getContext(), t); - }, - "Returns element type."); - } -}; - -class PyShapedType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAShaped; - static constexpr const char *pyClassName = "ShapedType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_property_readonly( - "element_type", - [](PyShapedType &self) { - MlirType t = mlirShapedTypeGetElementType(self); - return PyType(self.getContext(), t); - }, - "Returns the element type of the shaped type."); - c.def_property_readonly( - "has_rank", - [](PyShapedType &self) -> bool { return mlirShapedTypeHasRank(self); }, - "Returns whether the given shaped type is ranked."); - c.def_property_readonly( - "rank", - [](PyShapedType &self) { - self.requireHasRank(); - return mlirShapedTypeGetRank(self); - }, - "Returns the rank of the given ranked shaped type."); - c.def_property_readonly( - "has_static_shape", - [](PyShapedType &self) -> bool { - return mlirShapedTypeHasStaticShape(self); - }, - "Returns whether the given shaped type has a static shape."); - c.def( - "is_dynamic_dim", - [](PyShapedType &self, intptr_t dim) -> bool { - self.requireHasRank(); - return mlirShapedTypeIsDynamicDim(self, dim); - }, - "Returns whether the dim-th dimension of the given shaped type is " - "dynamic."); - c.def( - "get_dim_size", - [](PyShapedType &self, intptr_t dim) { - self.requireHasRank(); - return mlirShapedTypeGetDimSize(self, dim); - }, - "Returns the dim-th dimension of the given ranked shaped type."); - c.def_static( - "is_dynamic_size", - [](int64_t size) -> bool { return mlirShapedTypeIsDynamicSize(size); }, - "Returns whether the given dimension size indicates a dynamic " - "dimension."); - c.def( - "is_dynamic_stride_or_offset", - [](PyShapedType &self, int64_t val) -> bool { - self.requireHasRank(); - return mlirShapedTypeIsDynamicStrideOrOffset(val); - }, - "Returns whether the given value is used as a placeholder for dynamic " - "strides and offsets in shaped types."); - } - -private: - void requireHasRank() { - if (!mlirShapedTypeHasRank(*this)) { - throw SetPyError( - PyExc_ValueError, - "calling this method requires that the type has a rank."); - } - } -}; - -/// Vector Type subclass - VectorType. -class PyVectorType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAVector; - static constexpr const char *pyClassName = "VectorType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](std::vector shape, PyType &elementType, - DefaultingPyLocation loc) { - MlirType t = mlirVectorTypeGetChecked(loc, shape.size(), shape.data(), - elementType); - // TODO: Rework error reporting once diagnostic engine is exposed - // in C API. 
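
The rank-guarded queries above, exercised through `RankedTensorType` (bound
just below); this sketch assumes the then-current convention that a -1 extent
encodes a dynamic dimension:

from mlir.ir import Context, F32Type, Location, RankedTensorType

with Context(), Location.unknown():
    t = RankedTensorType.get([2, -1], F32Type.get())  # tensor<2x?xf32>
    assert t.has_rank and t.rank == 2
    assert not t.is_dynamic_dim(0) and t.is_dynamic_dim(1)
    assert t.get_dim_size(0) == 2
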
- if (mlirTypeIsNull(t)) { - throw SetPyError( - PyExc_ValueError, - Twine("invalid '") + - py::repr(py::cast(elementType)).cast() + - "' and expected floating point or integer type."); - } - return PyVectorType(elementType.getContext(), t); - }, - py::arg("shape"), py::arg("elementType"), py::arg("loc") = py::none(), - "Create a vector type"); - } -}; - -/// Ranked Tensor Type subclass - RankedTensorType. -class PyRankedTensorType - : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsARankedTensor; - static constexpr const char *pyClassName = "RankedTensorType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](std::vector shape, PyType &elementType, - DefaultingPyLocation loc) { - MlirType t = mlirRankedTensorTypeGetChecked( - loc, shape.size(), shape.data(), elementType); - // TODO: Rework error reporting once diagnostic engine is exposed - // in C API. - if (mlirTypeIsNull(t)) { - throw SetPyError( - PyExc_ValueError, - Twine("invalid '") + - py::repr(py::cast(elementType)).cast() + - "' and expected floating point, integer, vector or " - "complex " - "type."); - } - return PyRankedTensorType(elementType.getContext(), t); - }, - py::arg("shape"), py::arg("element_type"), py::arg("loc") = py::none(), - "Create a ranked tensor type"); - } -}; - -/// Unranked Tensor Type subclass - UnrankedTensorType. -class PyUnrankedTensorType - : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAUnrankedTensor; - static constexpr const char *pyClassName = "UnrankedTensorType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](PyType &elementType, DefaultingPyLocation loc) { - MlirType t = mlirUnrankedTensorTypeGetChecked(loc, elementType); - // TODO: Rework error reporting once diagnostic engine is exposed - // in C API. - if (mlirTypeIsNull(t)) { - throw SetPyError( - PyExc_ValueError, - Twine("invalid '") + - py::repr(py::cast(elementType)).cast() + - "' and expected floating point, integer, vector or " - "complex " - "type."); - } - return PyUnrankedTensorType(elementType.getContext(), t); - }, - py::arg("element_type"), py::arg("loc") = py::none(), - "Create a unranked tensor type"); - } -}; - -class PyMemRefLayoutMapList; - -/// Ranked MemRef Type subclass - MemRefType. -class PyMemRefType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsARankedTensor; - static constexpr const char *pyClassName = "MemRefType"; - using PyConcreteType::PyConcreteType; - - PyMemRefLayoutMapList getLayout(); - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](std::vector shape, PyType &elementType, - std::vector layout, PyAttribute *memorySpace, - DefaultingPyLocation loc) { - SmallVector maps; - maps.reserve(layout.size()); - for (PyAffineMap &map : layout) - maps.push_back(map); - - MlirAttribute memSpaceAttr = {}; - if (memorySpace) - memSpaceAttr = *memorySpace; - - MlirType t = mlirMemRefTypeGetChecked(loc, elementType, shape.size(), - shape.data(), maps.size(), - maps.data(), memSpaceAttr); - // TODO: Rework error reporting once diagnostic engine is exposed - // in C API. 
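
All of the `*GetChecked` constructors above follow the same pattern: a null
result from the C API becomes a Python `ValueError` carrying the repr of the
offending element type. For example, assuming vectors still reject `index`
elements as they did at the time:

from mlir.ir import Context, F32Type, IndexType, Location, VectorType

with Context(), Location.unknown():
    ok = VectorType.get([4], F32Type.get())
    try:
        VectorType.get([4], IndexType.get())
    except ValueError as e:
        print(e)  # "invalid '...' and expected floating point or integer type."
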
- if (mlirTypeIsNull(t)) { - throw SetPyError( - PyExc_ValueError, - Twine("invalid '") + - py::repr(py::cast(elementType)).cast() + - "' and expected floating point, integer, vector or " - "complex " - "type."); - } - return PyMemRefType(elementType.getContext(), t); - }, - py::arg("shape"), py::arg("element_type"), - py::arg("layout") = py::list(), py::arg("memory_space") = py::none(), - py::arg("loc") = py::none(), "Create a memref type") - .def_property_readonly("layout", &PyMemRefType::getLayout, - "The list of layout maps of the MemRef type.") - .def_property_readonly( - "memory_space", - [](PyMemRefType &self) -> PyAttribute { - MlirAttribute a = mlirMemRefTypeGetMemorySpace(self); - return PyAttribute(self.getContext(), a); - }, - "Returns the memory space of the given MemRef type."); - } -}; - -/// A list of affine layout maps in a memref type. Internally, these are stored -/// as consecutive elements, random access is cheap. Both the type and the maps -/// are owned by the context, no need to worry about lifetime extension. -class PyMemRefLayoutMapList - : public Sliceable { -public: - static constexpr const char *pyClassName = "MemRefLayoutMapList"; - - PyMemRefLayoutMapList(PyMemRefType type, intptr_t startIndex = 0, - intptr_t length = -1, intptr_t step = 1) - : Sliceable(startIndex, - length == -1 ? mlirMemRefTypeGetNumAffineMaps(type) : length, - step), - memref(type) {} - - intptr_t getNumElements() { return mlirMemRefTypeGetNumAffineMaps(memref); } - - PyAffineMap getElement(intptr_t index) { - return PyAffineMap(memref.getContext(), - mlirMemRefTypeGetAffineMap(memref, index)); - } - - PyMemRefLayoutMapList slice(intptr_t startIndex, intptr_t length, - intptr_t step) { - return PyMemRefLayoutMapList(memref, startIndex, length, step); - } - -private: - PyMemRefType memref; -}; - -PyMemRefLayoutMapList PyMemRefType::getLayout() { - return PyMemRefLayoutMapList(*this); -} - -/// Unranked MemRef Type subclass - UnrankedMemRefType. -class PyUnrankedMemRefType - : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAUnrankedMemRef; - static constexpr const char *pyClassName = "UnrankedMemRefType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](PyType &elementType, PyAttribute *memorySpace, - DefaultingPyLocation loc) { - MlirAttribute memSpaceAttr = {}; - if (memorySpace) - memSpaceAttr = *memorySpace; - - MlirType t = - mlirUnrankedMemRefTypeGetChecked(loc, elementType, memSpaceAttr); - // TODO: Rework error reporting once diagnostic engine is exposed - // in C API. - if (mlirTypeIsNull(t)) { - throw SetPyError( - PyExc_ValueError, - Twine("invalid '") + - py::repr(py::cast(elementType)).cast() + - "' and expected floating point, integer, vector or " - "complex " - "type."); - } - return PyUnrankedMemRefType(elementType.getContext(), t); - }, - py::arg("element_type"), py::arg("memory_space"), - py::arg("loc") = py::none(), "Create a unranked memref type") - .def_property_readonly( - "memory_space", - [](PyUnrankedMemRefType &self) -> PyAttribute { - MlirAttribute a = mlirMemRefTypeGetMemorySpace(self); - return PyAttribute(self.getContext(), a); - }, - "Returns the memory space of the given Unranked MemRef type."); - } -}; - -/// Tuple Type subclass - TupleType. 
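
A construction sketch for the memref bindings above, combining the optional
layout-map list and the attribute-typed memory space (`AffineMap` is bound
further down in this file); illustrative only:

from mlir.ir import (AffineMap, Context, F32Type, IntegerAttr, IntegerType,
                     Location, MemRefType)

with Context(), Location.unknown():
    f32 = F32Type.get()
    plain = MemRefType.get([2, 4], f32)  # identity layout, default memory space
    mapped = MemRefType.get([2, 4], f32, layout=[AffineMap.get_identity(2)])
    assert len(mapped.layout) == 1
    space = IntegerAttr.get(IntegerType.get_signless(32), 1)
    device = MemRefType.get([16], f32, memory_space=space)
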
-class PyTupleType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsATuple; - static constexpr const char *pyClassName = "TupleType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get_tuple", - [](py::list elementList, DefaultingPyMlirContext context) { - intptr_t num = py::len(elementList); - // Mapping py::list to SmallVector. - SmallVector elements; - for (auto element : elementList) - elements.push_back(element.cast()); - MlirType t = mlirTupleTypeGet(context->get(), num, elements.data()); - return PyTupleType(context->getRef(), t); - }, - py::arg("elements"), py::arg("context") = py::none(), - "Create a tuple type"); - c.def( - "get_type", - [](PyTupleType &self, intptr_t pos) -> PyType { - MlirType t = mlirTupleTypeGetType(self, pos); - return PyType(self.getContext(), t); - }, - "Returns the pos-th type in the tuple type."); - c.def_property_readonly( - "num_types", - [](PyTupleType &self) -> intptr_t { - return mlirTupleTypeGetNumTypes(self); - }, - "Returns the number of types contained in a tuple."); - } -}; - -/// Function type. -class PyFunctionType : public PyConcreteType { -public: - static constexpr IsAFunctionTy isaFunction = mlirTypeIsAFunction; - static constexpr const char *pyClassName = "FunctionType"; - using PyConcreteType::PyConcreteType; - - static void bindDerived(ClassTy &c) { - c.def_static( - "get", - [](std::vector inputs, std::vector results, - DefaultingPyMlirContext context) { - SmallVector inputsRaw(inputs.begin(), inputs.end()); - SmallVector resultsRaw(results.begin(), results.end()); - MlirType t = mlirFunctionTypeGet(context->get(), inputsRaw.size(), - inputsRaw.data(), resultsRaw.size(), - resultsRaw.data()); - return PyFunctionType(context->getRef(), t); - }, - py::arg("inputs"), py::arg("results"), py::arg("context") = py::none(), - "Gets a FunctionType from a list of input and result types"); - c.def_property_readonly( - "inputs", - [](PyFunctionType &self) { - MlirType t = self; - auto contextRef = self.getContext(); - py::list types; - for (intptr_t i = 0, e = mlirFunctionTypeGetNumInputs(self); i < e; - ++i) { - types.append(PyType(contextRef, mlirFunctionTypeGetInput(t, i))); - } - return types; - }, - "Returns the list of input types in the FunctionType."); - c.def_property_readonly( - "results", - [](PyFunctionType &self) { - auto contextRef = self.getContext(); - py::list types; - for (intptr_t i = 0, e = mlirFunctionTypeGetNumResults(self); i < e; - ++i) { - types.append( - PyType(contextRef, mlirFunctionTypeGetResult(self, i))); - } - return types; - }, - "Returns the list of result types in the FunctionType."); - } -}; - -} // namespace - -//------------------------------------------------------------------------------ -// PyAffineExpr and subclasses. -//------------------------------------------------------------------------------ - -namespace { -/// CRTP base class for Python MLIR affine expressions that subclass AffineExpr -/// and should be castable from it. Intermediate hierarchy classes can be -/// modeled by specifying BaseTy. -template -class PyConcreteAffineExpr : public BaseTy { -public: - // Derived classes must define statics for: - // IsAFunctionTy isaFunction - // const char *pyClassName - // and redefine bindDerived. 
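
Tuple and function types from the Python side, matching the `get_tuple` and
keyword names bound above; a brief sketch:

from mlir.ir import Context, F32Type, FunctionType, IndexType, TupleType

with Context():
    f32, idx = F32Type.get(), IndexType.get()
    tup = TupleType.get_tuple([f32, idx])
    assert tup.num_types == 2
    fn = FunctionType.get(inputs=[f32, f32], results=[idx])
    assert len(fn.inputs) == 2 and len(fn.results) == 1
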
- using ClassTy = py::class_; - using IsAFunctionTy = bool (*)(MlirAffineExpr); - - PyConcreteAffineExpr() = default; - PyConcreteAffineExpr(PyMlirContextRef contextRef, MlirAffineExpr affineExpr) - : BaseTy(std::move(contextRef), affineExpr) {} - PyConcreteAffineExpr(PyAffineExpr &orig) - : PyConcreteAffineExpr(orig.getContext(), castFrom(orig)) {} - - static MlirAffineExpr castFrom(PyAffineExpr &orig) { - if (!DerivedTy::isaFunction(orig)) { - auto origRepr = py::repr(py::cast(orig)).cast(); - throw SetPyError(PyExc_ValueError, - Twine("Cannot cast affine expression to ") + - DerivedTy::pyClassName + " (from " + origRepr + ")"); - } - return orig; - } - - static void bind(py::module &m) { - auto cls = ClassTy(m, DerivedTy::pyClassName); - cls.def(py::init()); - DerivedTy::bindDerived(cls); - } - - /// Implemented by derived classes to add methods to the Python subclass. - static void bindDerived(ClassTy &m) {} -}; - -class PyAffineConstantExpr : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAConstant; - static constexpr const char *pyClassName = "AffineConstantExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - static PyAffineConstantExpr get(intptr_t value, - DefaultingPyMlirContext context) { - MlirAffineExpr affineExpr = - mlirAffineConstantExprGet(context->get(), static_cast(value)); - return PyAffineConstantExpr(context->getRef(), affineExpr); - } - - static void bindDerived(ClassTy &c) { - c.def_static("get", &PyAffineConstantExpr::get, py::arg("value"), - py::arg("context") = py::none()); - c.def_property_readonly("value", [](PyAffineConstantExpr &self) { - return mlirAffineConstantExprGetValue(self); - }); - } -}; - -class PyAffineDimExpr : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsADim; - static constexpr const char *pyClassName = "AffineDimExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - static PyAffineDimExpr get(intptr_t pos, DefaultingPyMlirContext context) { - MlirAffineExpr affineExpr = mlirAffineDimExprGet(context->get(), pos); - return PyAffineDimExpr(context->getRef(), affineExpr); - } - - static void bindDerived(ClassTy &c) { - c.def_static("get", &PyAffineDimExpr::get, py::arg("position"), - py::arg("context") = py::none()); - c.def_property_readonly("position", [](PyAffineDimExpr &self) { - return mlirAffineDimExprGetPosition(self); - }); - } -}; - -class PyAffineSymbolExpr : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsASymbol; - static constexpr const char *pyClassName = "AffineSymbolExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - static PyAffineSymbolExpr get(intptr_t pos, DefaultingPyMlirContext context) { - MlirAffineExpr affineExpr = mlirAffineSymbolExprGet(context->get(), pos); - return PyAffineSymbolExpr(context->getRef(), affineExpr); - } - - static void bindDerived(ClassTy &c) { - c.def_static("get", &PyAffineSymbolExpr::get, py::arg("position"), - py::arg("context") = py::none()); - c.def_property_readonly("position", [](PyAffineSymbolExpr &self) { - return mlirAffineSymbolExprGetPosition(self); - }); - } -}; - -class PyAffineBinaryExpr : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsABinary; - static constexpr const char *pyClassName = "AffineBinaryExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - PyAffineExpr lhs() { - MlirAffineExpr lhsExpr = 
mlirAffineBinaryOpExprGetLHS(get()); - return PyAffineExpr(getContext(), lhsExpr); - } - - PyAffineExpr rhs() { - MlirAffineExpr rhsExpr = mlirAffineBinaryOpExprGetRHS(get()); - return PyAffineExpr(getContext(), rhsExpr); - } - - static void bindDerived(ClassTy &c) { - c.def_property_readonly("lhs", &PyAffineBinaryExpr::lhs); - c.def_property_readonly("rhs", &PyAffineBinaryExpr::rhs); - } -}; - -class PyAffineAddExpr - : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAAdd; - static constexpr const char *pyClassName = "AffineAddExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - static PyAffineAddExpr get(PyAffineExpr lhs, PyAffineExpr rhs) { - MlirAffineExpr expr = mlirAffineAddExprGet(lhs, rhs); - return PyAffineAddExpr(lhs.getContext(), expr); - } - - static void bindDerived(ClassTy &c) { - c.def_static("get", &PyAffineAddExpr::get); - } -}; - -class PyAffineMulExpr - : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAMul; - static constexpr const char *pyClassName = "AffineMulExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - static PyAffineMulExpr get(PyAffineExpr lhs, PyAffineExpr rhs) { - MlirAffineExpr expr = mlirAffineMulExprGet(lhs, rhs); - return PyAffineMulExpr(lhs.getContext(), expr); - } - - static void bindDerived(ClassTy &c) { - c.def_static("get", &PyAffineMulExpr::get); - } -}; - -class PyAffineModExpr - : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAMod; - static constexpr const char *pyClassName = "AffineModExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - static PyAffineModExpr get(PyAffineExpr lhs, PyAffineExpr rhs) { - MlirAffineExpr expr = mlirAffineModExprGet(lhs, rhs); - return PyAffineModExpr(lhs.getContext(), expr); - } - - static void bindDerived(ClassTy &c) { - c.def_static("get", &PyAffineModExpr::get); - } -}; - -class PyAffineFloorDivExpr - : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsAFloorDiv; - static constexpr const char *pyClassName = "AffineFloorDivExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - static PyAffineFloorDivExpr get(PyAffineExpr lhs, PyAffineExpr rhs) { - MlirAffineExpr expr = mlirAffineFloorDivExprGet(lhs, rhs); - return PyAffineFloorDivExpr(lhs.getContext(), expr); - } - - static void bindDerived(ClassTy &c) { - c.def_static("get", &PyAffineFloorDivExpr::get); - } -}; - -class PyAffineCeilDivExpr - : public PyConcreteAffineExpr { -public: - static constexpr IsAFunctionTy isaFunction = mlirAffineExprIsACeilDiv; - static constexpr const char *pyClassName = "AffineCeilDivExpr"; - using PyConcreteAffineExpr::PyConcreteAffineExpr; - - static PyAffineCeilDivExpr get(PyAffineExpr lhs, PyAffineExpr rhs) { - MlirAffineExpr expr = mlirAffineCeilDivExprGet(lhs, rhs); - return PyAffineCeilDivExpr(lhs.getContext(), expr); - } - - static void bindDerived(ClassTy &c) { - c.def_static("get", &PyAffineCeilDivExpr::get); - } -}; -} // namespace - -bool PyAffineExpr::operator==(const PyAffineExpr &other) { - return mlirAffineExprEqual(affineExpr, other.affineExpr); -} - -py::object PyAffineExpr::getCapsule() { - return py::reinterpret_steal( - mlirPythonAffineExprToCapsule(*this)); -} - -PyAffineExpr PyAffineExpr::createFromCapsule(py::object capsule) { - MlirAffineExpr rawAffineExpr = mlirPythonCapsuleToAffineExpr(capsule.ptr()); - if (mlirAffineExprIsNull(rawAffineExpr)) - throw 
py::error_already_set(); - return PyAffineExpr( - PyMlirContext::forContext(mlirAffineExprGetContext(rawAffineExpr)), - rawAffineExpr); -} - -//------------------------------------------------------------------------------ -// PyAffineMap and utilities. -//------------------------------------------------------------------------------ - -namespace { -/// A list of expressions contained in an affine map. Internally these are -/// stored as a consecutive array leading to inexpensive random access. Both -/// the map and the expression are owned by the context so we need not bother -/// with lifetime extension. -class PyAffineMapExprList - : public Sliceable { -public: - static constexpr const char *pyClassName = "AffineExprList"; - - PyAffineMapExprList(PyAffineMap map, intptr_t startIndex = 0, - intptr_t length = -1, intptr_t step = 1) - : Sliceable(startIndex, - length == -1 ? mlirAffineMapGetNumResults(map) : length, - step), - affineMap(map) {} - - intptr_t getNumElements() { return mlirAffineMapGetNumResults(affineMap); } - - PyAffineExpr getElement(intptr_t pos) { - return PyAffineExpr(affineMap.getContext(), - mlirAffineMapGetResult(affineMap, pos)); - } - - PyAffineMapExprList slice(intptr_t startIndex, intptr_t length, - intptr_t step) { - return PyAffineMapExprList(affineMap, startIndex, length, step); - } - -private: - PyAffineMap affineMap; -}; -} // end namespace - -bool PyAffineMap::operator==(const PyAffineMap &other) { - return mlirAffineMapEqual(affineMap, other.affineMap); -} - -py::object PyAffineMap::getCapsule() { - return py::reinterpret_steal(mlirPythonAffineMapToCapsule(*this)); -} - -PyAffineMap PyAffineMap::createFromCapsule(py::object capsule) { - MlirAffineMap rawAffineMap = mlirPythonCapsuleToAffineMap(capsule.ptr()); - if (mlirAffineMapIsNull(rawAffineMap)) - throw py::error_already_set(); - return PyAffineMap( - PyMlirContext::forContext(mlirAffineMapGetContext(rawAffineMap)), - rawAffineMap); -} - -//------------------------------------------------------------------------------ -// PyIntegerSet and utilities. -//------------------------------------------------------------------------------ - -class PyIntegerSetConstraint { -public: - PyIntegerSetConstraint(PyIntegerSet set, intptr_t pos) : set(set), pos(pos) {} - - PyAffineExpr getExpr() { - return PyAffineExpr(set.getContext(), - mlirIntegerSetGetConstraint(set, pos)); - } - - bool isEq() { return mlirIntegerSetIsConstraintEq(set, pos); } - - static void bind(py::module &m) { - py::class_(m, "IntegerSetConstraint") - .def_property_readonly("expr", &PyIntegerSetConstraint::getExpr) - .def_property_readonly("is_eq", &PyIntegerSetConstraint::isEq); - } - -private: - PyIntegerSet set; - intptr_t pos; -}; - -class PyIntegerSetConstraintList - : public Sliceable { -public: - static constexpr const char *pyClassName = "IntegerSetConstraintList"; - - PyIntegerSetConstraintList(PyIntegerSet set, intptr_t startIndex = 0, - intptr_t length = -1, intptr_t step = 1) - : Sliceable(startIndex, - length == -1 ? 
mlirIntegerSetGetNumConstraints(set) : length,
-                  step),
-        set(set) {}
-
-  intptr_t getNumElements() { return mlirIntegerSetGetNumConstraints(set); }
-
-  PyIntegerSetConstraint getElement(intptr_t pos) {
-    return PyIntegerSetConstraint(set, pos);
-  }
-
-  PyIntegerSetConstraintList slice(intptr_t startIndex, intptr_t length,
-                                   intptr_t step) {
-    return PyIntegerSetConstraintList(set, startIndex, length, step);
-  }
-
-private:
-  PyIntegerSet set;
-};
-
-bool PyIntegerSet::operator==(const PyIntegerSet &other) {
-  return mlirIntegerSetEqual(integerSet, other.integerSet);
-}
-
-py::object PyIntegerSet::getCapsule() {
-  return py::reinterpret_steal<py::object>(
-      mlirPythonIntegerSetToCapsule(*this));
-}
-
-PyIntegerSet PyIntegerSet::createFromCapsule(py::object capsule) {
-  MlirIntegerSet rawIntegerSet = mlirPythonCapsuleToIntegerSet(capsule.ptr());
-  if (mlirIntegerSetIsNull(rawIntegerSet))
-    throw py::error_already_set();
-  return PyIntegerSet(
-      PyMlirContext::forContext(mlirIntegerSetGetContext(rawIntegerSet)),
-      rawIntegerSet);
-}
-
-/// Attempts to populate `result` with the content of `list` casted to the
-/// appropriate type (Python and C types are provided as template arguments).
-/// Throws errors in case of failure, using "action" to describe what the caller
-/// was attempting to do.
-template <typename PyType, typename CType>
-static void pyListToVector(py::list list, llvm::SmallVectorImpl<CType> &result,
-                           StringRef action) {
-  result.reserve(py::len(list));
-  for (py::handle item : list) {
-    try {
-      result.push_back(item.cast<CType>());
-    } catch (py::cast_error &err) {
-      std::string msg = (llvm::Twine("Invalid expression when ") + action +
-                         " (" + err.what() + ")")
-                            .str();
-      throw py::cast_error(msg);
-    } catch (py::reference_cast_error &err) {
-      std::string msg = (llvm::Twine("Invalid expression (None?) when ") +
-                         action + " (" + err.what() + ")")
-                            .str();
-      throw py::cast_error(msg);
-    }
-  }
-}
-
-//------------------------------------------------------------------------------
-// Populates the pybind11 IR submodule.
-//------------------------------------------------------------------------------ - -void mlir::python::populateIRSubmodule(py::module &m) { - //---------------------------------------------------------------------------- - // Mapping of MlirContext - //---------------------------------------------------------------------------- - py::class_(m, "Context") - .def(py::init<>(&PyMlirContext::createNewContextForInit)) - .def_static("_get_live_count", &PyMlirContext::getLiveCount) - .def("_get_context_again", - [](PyMlirContext &self) { - PyMlirContextRef ref = PyMlirContext::forContext(self.get()); - return ref.releaseObject(); - }) - .def("_get_live_operation_count", &PyMlirContext::getLiveOperationCount) - .def("_get_live_module_count", &PyMlirContext::getLiveModuleCount) - .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, - &PyMlirContext::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyMlirContext::createFromCapsule) - .def("__enter__", &PyMlirContext::contextEnter) - .def("__exit__", &PyMlirContext::contextExit) - .def_property_readonly_static( - "current", - [](py::object & /*class*/) { - auto *context = PyThreadContextEntry::getDefaultContext(); - if (!context) - throw SetPyError(PyExc_ValueError, "No current Context"); - return context; - }, - "Gets the Context bound to the current thread or raises ValueError") - .def_property_readonly( - "dialects", - [](PyMlirContext &self) { return PyDialects(self.getRef()); }, - "Gets a container for accessing dialects by name") - .def_property_readonly( - "d", [](PyMlirContext &self) { return PyDialects(self.getRef()); }, - "Alias for 'dialect'") - .def( - "get_dialect_descriptor", - [=](PyMlirContext &self, std::string &name) { - MlirDialect dialect = mlirContextGetOrLoadDialect( - self.get(), {name.data(), name.size()}); - if (mlirDialectIsNull(dialect)) { - throw SetPyError(PyExc_ValueError, - Twine("Dialect '") + name + "' not found"); - } - return PyDialectDescriptor(self.getRef(), dialect); - }, - "Gets or loads a dialect by name, returning its descriptor object") - .def_property( - "allow_unregistered_dialects", - [](PyMlirContext &self) -> bool { - return mlirContextGetAllowUnregisteredDialects(self.get()); - }, - [](PyMlirContext &self, bool value) { - mlirContextSetAllowUnregisteredDialects(self.get(), value); - }); - - //---------------------------------------------------------------------------- - // Mapping of PyDialectDescriptor - //---------------------------------------------------------------------------- - py::class_(m, "DialectDescriptor") - .def_property_readonly("namespace", - [](PyDialectDescriptor &self) { - MlirStringRef ns = - mlirDialectGetNamespace(self.get()); - return py::str(ns.data, ns.length); - }) - .def("__repr__", [](PyDialectDescriptor &self) { - MlirStringRef ns = mlirDialectGetNamespace(self.get()); - std::string repr(""); - return repr; - }); - - //---------------------------------------------------------------------------- - // Mapping of PyDialects - //---------------------------------------------------------------------------- - py::class_(m, "Dialects") - .def("__getitem__", - [=](PyDialects &self, std::string keyName) { - MlirDialect dialect = - self.getDialectForKey(keyName, /*attrError=*/false); - py::object descriptor = - py::cast(PyDialectDescriptor{self.getContext(), dialect}); - return createCustomDialectWrapper(keyName, std::move(descriptor)); - }) - .def("__getattr__", [=](PyDialects &self, std::string attrName) { - MlirDialect dialect = - self.getDialectForKey(attrName, /*attrError=*/true); 
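
What the Context surface above looks like in use; 'std' is assumed here to be
a registered dialect name in a default-initialized context:

from mlir.ir import Context

with Context() as ctx:
    ctx.allow_unregistered_dialects = True
    std = ctx.dialects["std"]  # keyed access; raises if the dialect is unknown
    also_std = ctx.d.std       # same lookup through the short alias
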
- py::object descriptor = - py::cast(PyDialectDescriptor{self.getContext(), dialect}); - return createCustomDialectWrapper(attrName, std::move(descriptor)); - }); - - //---------------------------------------------------------------------------- - // Mapping of PyDialect - //---------------------------------------------------------------------------- - py::class_(m, "Dialect") - .def(py::init(), "descriptor") - .def_property_readonly( - "descriptor", [](PyDialect &self) { return self.getDescriptor(); }) - .def("__repr__", [](py::object self) { - auto clazz = self.attr("__class__"); - return py::str(""); - }); - - //---------------------------------------------------------------------------- - // Mapping of Location - //---------------------------------------------------------------------------- - py::class_(m, "Location") - .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, &PyLocation::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyLocation::createFromCapsule) - .def("__enter__", &PyLocation::contextEnter) - .def("__exit__", &PyLocation::contextExit) - .def("__eq__", - [](PyLocation &self, PyLocation &other) -> bool { - return mlirLocationEqual(self, other); - }) - .def("__eq__", [](PyLocation &self, py::object other) { return false; }) - .def_property_readonly_static( - "current", - [](py::object & /*class*/) { - auto *loc = PyThreadContextEntry::getDefaultLocation(); - if (!loc) - throw SetPyError(PyExc_ValueError, "No current Location"); - return loc; - }, - "Gets the Location bound to the current thread or raises ValueError") - .def_static( - "unknown", - [](DefaultingPyMlirContext context) { - return PyLocation(context->getRef(), - mlirLocationUnknownGet(context->get())); - }, - py::arg("context") = py::none(), - "Gets a Location representing an unknown location") - .def_static( - "file", - [](std::string filename, int line, int col, - DefaultingPyMlirContext context) { - return PyLocation( - context->getRef(), - mlirLocationFileLineColGet( - context->get(), toMlirStringRef(filename), line, col)); - }, - py::arg("filename"), py::arg("line"), py::arg("col"), - py::arg("context") = py::none(), kContextGetFileLocationDocstring) - .def_property_readonly( - "context", - [](PyLocation &self) { return self.getContext().getObject(); }, - "Context that owns the Location") - .def("__repr__", [](PyLocation &self) { - PyPrintAccumulator printAccum; - mlirLocationPrint(self, printAccum.getCallback(), - printAccum.getUserData()); - return printAccum.join(); - }); + //---------------------------------------------------------------------------- + // Mapping of Location + //---------------------------------------------------------------------------- + py::class_(m, "Location") + .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, &PyLocation::getCapsule) + .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyLocation::createFromCapsule) + .def("__enter__", &PyLocation::contextEnter) + .def("__exit__", &PyLocation::contextExit) + .def("__eq__", + [](PyLocation &self, PyLocation &other) -> bool { + return mlirLocationEqual(self, other); + }) + .def("__eq__", [](PyLocation &self, py::object other) { return false; }) + .def_property_readonly_static( + "current", + [](py::object & /*class*/) { + auto *loc = PyThreadContextEntry::getDefaultLocation(); + if (!loc) + throw SetPyError(PyExc_ValueError, "No current Location"); + return loc; + }, + "Gets the Location bound to the current thread or raises ValueError") + .def_static( + "unknown", + [](DefaultingPyMlirContext context) { + return PyLocation(context->getRef(), 
+ mlirLocationUnknownGet(context->get())); + }, + py::arg("context") = py::none(), + "Gets a Location representing an unknown location") + .def_static( + "file", + [](std::string filename, int line, int col, + DefaultingPyMlirContext context) { + return PyLocation( + context->getRef(), + mlirLocationFileLineColGet( + context->get(), toMlirStringRef(filename), line, col)); + }, + py::arg("filename"), py::arg("line"), py::arg("col"), + py::arg("context") = py::none(), kContextGetFileLocationDocstring) + .def_property_readonly( + "context", + [](PyLocation &self) { return self.getContext().getObject(); }, + "Context that owns the Location") + .def("__repr__", [](PyLocation &self) { + PyPrintAccumulator printAccum; + mlirLocationPrint(self, printAccum.getCallback(), + printAccum.getUserData()); + return printAccum.join(); + }); //---------------------------------------------------------------------------- // Mapping of Module @@ -4022,22 +2259,6 @@ void mlir::python::populateIRSubmodule(py::module &m) { py::keep_alive<0, 1>(), "The underlying generic attribute of the NamedAttribute binding"); - // Builtin attribute bindings. - PyAffineMapAttribute::bind(m); - PyArrayAttribute::bind(m); - PyArrayAttribute::PyArrayAttributeIterator::bind(m); - PyBoolAttribute::bind(m); - PyDenseElementsAttribute::bind(m); - PyDenseFPElementsAttribute::bind(m); - PyDenseIntElementsAttribute::bind(m); - PyDictAttribute::bind(m); - PyFlatSymbolRefAttribute::bind(m); - PyFloatAttribute::bind(m); - PyIntegerAttribute::bind(m); - PyStringAttribute::bind(m); - PyTypeAttribute::bind(m); - PyUnitAttribute::bind(m); - //---------------------------------------------------------------------------- // Mapping of PyType. //---------------------------------------------------------------------------- @@ -4088,25 +2309,6 @@ void mlir::python::populateIRSubmodule(py::module &m) { return printAccum.join(); }); - // Builtin type bindings. - PyIntegerType::bind(m); - PyIndexType::bind(m); - PyBF16Type::bind(m); - PyF16Type::bind(m); - PyF32Type::bind(m); - PyF64Type::bind(m); - PyNoneType::bind(m); - PyComplexType::bind(m); - PyShapedType::bind(m); - PyVectorType::bind(m); - PyRankedTensorType::bind(m); - PyUnrankedTensorType::bind(m); - PyMemRefType::bind(m); - PyMemRefLayoutMapList::bind(m); - PyUnrankedMemRefType::bind(m); - PyTupleType::bind(m); - PyFunctionType::bind(m); - //---------------------------------------------------------------------------- // Mapping of Value. //---------------------------------------------------------------------------- @@ -4152,359 +2354,4 @@ void mlir::python::populateIRSubmodule(py::module &m) { PyOpResultList::bind(m); PyRegionIterator::bind(m); PyRegionList::bind(m); - - //---------------------------------------------------------------------------- - // Mapping of PyAffineExpr and derived classes. 
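
The Location mapping above in action; locations nest as context managers
exactly like Context, and the keyword names follow the bindings:

from mlir.ir import Context, Location

with Context():
    loc = Location.file("example.mlir", line=10, col=3)
    with loc:
        pass   # objects built here pick up `loc` as the default location
    print(loc)  # printed via mlirLocationPrint
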
- //---------------------------------------------------------------------------- - py::class_(m, "AffineExpr") - .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, - &PyAffineExpr::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyAffineExpr::createFromCapsule) - .def("__add__", - [](PyAffineExpr &self, PyAffineExpr &other) { - return PyAffineAddExpr::get(self, other); - }) - .def("__mul__", - [](PyAffineExpr &self, PyAffineExpr &other) { - return PyAffineMulExpr::get(self, other); - }) - .def("__mod__", - [](PyAffineExpr &self, PyAffineExpr &other) { - return PyAffineModExpr::get(self, other); - }) - .def("__sub__", - [](PyAffineExpr &self, PyAffineExpr &other) { - auto negOne = - PyAffineConstantExpr::get(-1, *self.getContext().get()); - return PyAffineAddExpr::get(self, - PyAffineMulExpr::get(negOne, other)); - }) - .def("__eq__", [](PyAffineExpr &self, - PyAffineExpr &other) { return self == other; }) - .def("__eq__", - [](PyAffineExpr &self, py::object &other) { return false; }) - .def("__str__", - [](PyAffineExpr &self) { - PyPrintAccumulator printAccum; - mlirAffineExprPrint(self, printAccum.getCallback(), - printAccum.getUserData()); - return printAccum.join(); - }) - .def("__repr__", - [](PyAffineExpr &self) { - PyPrintAccumulator printAccum; - printAccum.parts.append("AffineExpr("); - mlirAffineExprPrint(self, printAccum.getCallback(), - printAccum.getUserData()); - printAccum.parts.append(")"); - return printAccum.join(); - }) - .def_property_readonly( - "context", - [](PyAffineExpr &self) { return self.getContext().getObject(); }) - .def_static( - "get_add", &PyAffineAddExpr::get, - "Gets an affine expression containing a sum of two expressions.") - .def_static( - "get_mul", &PyAffineMulExpr::get, - "Gets an affine expression containing a product of two expressions.") - .def_static("get_mod", &PyAffineModExpr::get, - "Gets an affine expression containing the modulo of dividing " - "one expression by another.") - .def_static("get_floor_div", &PyAffineFloorDivExpr::get, - "Gets an affine expression containing the rounded-down " - "result of dividing one expression by another.") - .def_static("get_ceil_div", &PyAffineCeilDivExpr::get, - "Gets an affine expression containing the rounded-up result " - "of dividing one expression by another.") - .def_static("get_constant", &PyAffineConstantExpr::get, py::arg("value"), - py::arg("context") = py::none(), - "Gets a constant affine expression with the given value.") - .def_static( - "get_dim", &PyAffineDimExpr::get, py::arg("position"), - py::arg("context") = py::none(), - "Gets an affine expression of a dimension at the given position.") - .def_static( - "get_symbol", &PyAffineSymbolExpr::get, py::arg("position"), - py::arg("context") = py::none(), - "Gets an affine expression of a symbol at the given position.") - .def( - "dump", [](PyAffineExpr &self) { mlirAffineExprDump(self); }, - kDumpDocstring); - PyAffineConstantExpr::bind(m); - PyAffineDimExpr::bind(m); - PyAffineSymbolExpr::bind(m); - PyAffineBinaryExpr::bind(m); - PyAffineAddExpr::bind(m); - PyAffineMulExpr::bind(m); - PyAffineModExpr::bind(m); - PyAffineFloorDivExpr::bind(m); - PyAffineCeilDivExpr::bind(m); - - //---------------------------------------------------------------------------- - // Mapping of PyAffineMap. 
- //---------------------------------------------------------------------------- - py::class_(m, "AffineMap") - .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, - &PyAffineMap::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyAffineMap::createFromCapsule) - .def("__eq__", - [](PyAffineMap &self, PyAffineMap &other) { return self == other; }) - .def("__eq__", [](PyAffineMap &self, py::object &other) { return false; }) - .def("__str__", - [](PyAffineMap &self) { - PyPrintAccumulator printAccum; - mlirAffineMapPrint(self, printAccum.getCallback(), - printAccum.getUserData()); - return printAccum.join(); - }) - .def("__repr__", - [](PyAffineMap &self) { - PyPrintAccumulator printAccum; - printAccum.parts.append("AffineMap("); - mlirAffineMapPrint(self, printAccum.getCallback(), - printAccum.getUserData()); - printAccum.parts.append(")"); - return printAccum.join(); - }) - .def_property_readonly( - "context", - [](PyAffineMap &self) { return self.getContext().getObject(); }, - "Context that owns the Affine Map") - .def( - "dump", [](PyAffineMap &self) { mlirAffineMapDump(self); }, - kDumpDocstring) - .def_static( - "get", - [](intptr_t dimCount, intptr_t symbolCount, py::list exprs, - DefaultingPyMlirContext context) { - SmallVector affineExprs; - pyListToVector( - exprs, affineExprs, "attempting to create an AffineMap"); - MlirAffineMap map = - mlirAffineMapGet(context->get(), dimCount, symbolCount, - affineExprs.size(), affineExprs.data()); - return PyAffineMap(context->getRef(), map); - }, - py::arg("dim_count"), py::arg("symbol_count"), py::arg("exprs"), - py::arg("context") = py::none(), - "Gets a map with the given expressions as results.") - .def_static( - "get_constant", - [](intptr_t value, DefaultingPyMlirContext context) { - MlirAffineMap affineMap = - mlirAffineMapConstantGet(context->get(), value); - return PyAffineMap(context->getRef(), affineMap); - }, - py::arg("value"), py::arg("context") = py::none(), - "Gets an affine map with a single constant result") - .def_static( - "get_empty", - [](DefaultingPyMlirContext context) { - MlirAffineMap affineMap = mlirAffineMapEmptyGet(context->get()); - return PyAffineMap(context->getRef(), affineMap); - }, - py::arg("context") = py::none(), "Gets an empty affine map.") - .def_static( - "get_identity", - [](intptr_t nDims, DefaultingPyMlirContext context) { - MlirAffineMap affineMap = - mlirAffineMapMultiDimIdentityGet(context->get(), nDims); - return PyAffineMap(context->getRef(), affineMap); - }, - py::arg("n_dims"), py::arg("context") = py::none(), - "Gets an identity map with the given number of dimensions.") - .def_static( - "get_minor_identity", - [](intptr_t nDims, intptr_t nResults, - DefaultingPyMlirContext context) { - MlirAffineMap affineMap = - mlirAffineMapMinorIdentityGet(context->get(), nDims, nResults); - return PyAffineMap(context->getRef(), affineMap); - }, - py::arg("n_dims"), py::arg("n_results"), - py::arg("context") = py::none(), - "Gets a minor identity map with the given number of dimensions and " - "results.") - .def_static( - "get_permutation", - [](std::vector permutation, - DefaultingPyMlirContext context) { - if (!isPermutation(permutation)) - throw py::cast_error("Invalid permutation when attempting to " - "create an AffineMap"); - MlirAffineMap affineMap = mlirAffineMapPermutationGet( - context->get(), permutation.size(), permutation.data()); - return PyAffineMap(context->getRef(), affineMap); - }, - py::arg("permutation"), py::arg("context") = py::none(), - "Gets an affine map that permutes its 
inputs.") - .def("get_submap", - [](PyAffineMap &self, std::vector &resultPos) { - intptr_t numResults = mlirAffineMapGetNumResults(self); - for (intptr_t pos : resultPos) { - if (pos < 0 || pos >= numResults) - throw py::value_error("result position out of bounds"); - } - MlirAffineMap affineMap = mlirAffineMapGetSubMap( - self, resultPos.size(), resultPos.data()); - return PyAffineMap(self.getContext(), affineMap); - }) - .def("get_major_submap", - [](PyAffineMap &self, intptr_t nResults) { - if (nResults >= mlirAffineMapGetNumResults(self)) - throw py::value_error("number of results out of bounds"); - MlirAffineMap affineMap = - mlirAffineMapGetMajorSubMap(self, nResults); - return PyAffineMap(self.getContext(), affineMap); - }) - .def("get_minor_submap", - [](PyAffineMap &self, intptr_t nResults) { - if (nResults >= mlirAffineMapGetNumResults(self)) - throw py::value_error("number of results out of bounds"); - MlirAffineMap affineMap = - mlirAffineMapGetMinorSubMap(self, nResults); - return PyAffineMap(self.getContext(), affineMap); - }) - .def_property_readonly( - "is_permutation", - [](PyAffineMap &self) { return mlirAffineMapIsPermutation(self); }) - .def_property_readonly("is_projected_permutation", - [](PyAffineMap &self) { - return mlirAffineMapIsProjectedPermutation(self); - }) - .def_property_readonly( - "n_dims", - [](PyAffineMap &self) { return mlirAffineMapGetNumDims(self); }) - .def_property_readonly( - "n_inputs", - [](PyAffineMap &self) { return mlirAffineMapGetNumInputs(self); }) - .def_property_readonly( - "n_symbols", - [](PyAffineMap &self) { return mlirAffineMapGetNumSymbols(self); }) - .def_property_readonly("results", [](PyAffineMap &self) { - return PyAffineMapExprList(self); - }); - PyAffineMapExprList::bind(m); - - //---------------------------------------------------------------------------- - // Mapping of PyIntegerSet. - //---------------------------------------------------------------------------- - py::class_(m, "IntegerSet") - .def_property_readonly(MLIR_PYTHON_CAPI_PTR_ATTR, - &PyIntegerSet::getCapsule) - .def(MLIR_PYTHON_CAPI_FACTORY_ATTR, &PyIntegerSet::createFromCapsule) - .def("__eq__", [](PyIntegerSet &self, - PyIntegerSet &other) { return self == other; }) - .def("__eq__", [](PyIntegerSet &self, py::object other) { return false; }) - .def("__str__", - [](PyIntegerSet &self) { - PyPrintAccumulator printAccum; - mlirIntegerSetPrint(self, printAccum.getCallback(), - printAccum.getUserData()); - return printAccum.join(); - }) - .def("__repr__", - [](PyIntegerSet &self) { - PyPrintAccumulator printAccum; - printAccum.parts.append("IntegerSet("); - mlirIntegerSetPrint(self, printAccum.getCallback(), - printAccum.getUserData()); - printAccum.parts.append(")"); - return printAccum.join(); - }) - .def_property_readonly( - "context", - [](PyIntegerSet &self) { return self.getContext().getObject(); }) - .def( - "dump", [](PyIntegerSet &self) { mlirIntegerSetDump(self); }, - kDumpDocstring) - .def_static( - "get", - [](intptr_t numDims, intptr_t numSymbols, py::list exprs, - std::vector eqFlags, DefaultingPyMlirContext context) { - if (exprs.size() != eqFlags.size()) - throw py::value_error( - "Expected the number of constraints to match " - "that of equality flags"); - if (exprs.empty()) - throw py::value_error("Expected non-empty list of constraints"); - - // Copy over to a SmallVector because std::vector has a - // specialization for booleans that packs data and does not - // expose a `bool *`. 
- SmallVector flags(eqFlags.begin(), eqFlags.end()); - - SmallVector affineExprs; - pyListToVector(exprs, affineExprs, - "attempting to create an IntegerSet"); - MlirIntegerSet set = mlirIntegerSetGet( - context->get(), numDims, numSymbols, exprs.size(), - affineExprs.data(), flags.data()); - return PyIntegerSet(context->getRef(), set); - }, - py::arg("num_dims"), py::arg("num_symbols"), py::arg("exprs"), - py::arg("eq_flags"), py::arg("context") = py::none()) - .def_static( - "get_empty", - [](intptr_t numDims, intptr_t numSymbols, - DefaultingPyMlirContext context) { - MlirIntegerSet set = - mlirIntegerSetEmptyGet(context->get(), numDims, numSymbols); - return PyIntegerSet(context->getRef(), set); - }, - py::arg("num_dims"), py::arg("num_symbols"), - py::arg("context") = py::none()) - .def("get_replaced", - [](PyIntegerSet &self, py::list dimExprs, py::list symbolExprs, - intptr_t numResultDims, intptr_t numResultSymbols) { - if (static_cast(dimExprs.size()) != - mlirIntegerSetGetNumDims(self)) - throw py::value_error( - "Expected the number of dimension replacement expressions " - "to match that of dimensions"); - if (static_cast(symbolExprs.size()) != - mlirIntegerSetGetNumSymbols(self)) - throw py::value_error( - "Expected the number of symbol replacement expressions " - "to match that of symbols"); - - SmallVector dimAffineExprs, symbolAffineExprs; - pyListToVector( - dimExprs, dimAffineExprs, - "attempting to create an IntegerSet by replacing dimensions"); - pyListToVector( - symbolExprs, symbolAffineExprs, - "attempting to create an IntegerSet by replacing symbols"); - MlirIntegerSet set = mlirIntegerSetReplaceGet( - self, dimAffineExprs.data(), symbolAffineExprs.data(), - numResultDims, numResultSymbols); - return PyIntegerSet(self.getContext(), set); - }) - .def_property_readonly("is_canonical_empty", - [](PyIntegerSet &self) { - return mlirIntegerSetIsCanonicalEmpty(self); - }) - .def_property_readonly( - "n_dims", - [](PyIntegerSet &self) { return mlirIntegerSetGetNumDims(self); }) - .def_property_readonly( - "n_symbols", - [](PyIntegerSet &self) { return mlirIntegerSetGetNumSymbols(self); }) - .def_property_readonly( - "n_inputs", - [](PyIntegerSet &self) { return mlirIntegerSetGetNumInputs(self); }) - .def_property_readonly("n_equalities", - [](PyIntegerSet &self) { - return mlirIntegerSetGetNumEqualities(self); - }) - .def_property_readonly("n_inequalities", - [](PyIntegerSet &self) { - return mlirIntegerSetGetNumInequalities(self); - }) - .def_property_readonly("constraints", [](PyIntegerSet &self) { - return PyIntegerSetConstraintList(self); - }); - PyIntegerSetConstraint::bind(m); - PyIntegerSetConstraintList::bind(m); } diff --git a/mlir/lib/Bindings/Python/IRModules.h b/mlir/lib/Bindings/Python/IRModule.h similarity index 99% rename from mlir/lib/Bindings/Python/IRModules.h rename to mlir/lib/Bindings/Python/IRModule.h index 8140d704300d..5c710abe789a 100644 --- a/mlir/lib/Bindings/Python/IRModules.h +++ b/mlir/lib/Bindings/Python/IRModule.h @@ -747,7 +747,10 @@ private: MlirIntegerSet integerSet; }; -void populateIRSubmodule(pybind11::module &m); +void populateIRAffine(pybind11::module &m); +void populateIRAttributes(pybind11::module &m); +void populateIRCore(pybind11::module &m); +void populateIRTypes(pybind11::module &m); } // namespace python } // namespace mlir diff --git a/mlir/lib/Bindings/Python/IRTypes.cpp b/mlir/lib/Bindings/Python/IRTypes.cpp new file mode 100644 index 000000000000..96f6bf6666c9 --- /dev/null +++ b/mlir/lib/Bindings/Python/IRTypes.cpp @@ -0,0 
+1,678 @@ +//===- IRTypes.cpp - Exports builtin and standard types -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "IRModule.h" + +#include "PybindUtils.h" + +#include "mlir-c/BuiltinTypes.h" + +namespace py = pybind11; +using namespace mlir; +using namespace mlir::python; + +using llvm::SmallVector; +using llvm::Twine; + +namespace { + +/// Checks whether the given type is an integer or float type. +static int mlirTypeIsAIntegerOrFloat(MlirType type) { + return mlirTypeIsAInteger(type) || mlirTypeIsABF16(type) || + mlirTypeIsAF16(type) || mlirTypeIsAF32(type) || mlirTypeIsAF64(type); +} + +/// CRTP base classes for Python types that subclass Type and should be +/// castable from it (i.e. via something like IntegerType(t)). +/// By default, type class hierarchies are one level deep (i.e. a +/// concrete type class extends PyType); however, intermediate python-visible +/// base classes can be modeled by specifying a BaseTy. +template +class PyConcreteType : public BaseTy { +public: + // Derived classes must define statics for: + // IsAFunctionTy isaFunction + // const char *pyClassName + using ClassTy = py::class_; + using IsAFunctionTy = bool (*)(MlirType); + + PyConcreteType() = default; + PyConcreteType(PyMlirContextRef contextRef, MlirType t) + : BaseTy(std::move(contextRef), t) {} + PyConcreteType(PyType &orig) + : PyConcreteType(orig.getContext(), castFrom(orig)) {} + + static MlirType castFrom(PyType &orig) { + if (!DerivedTy::isaFunction(orig)) { + auto origRepr = py::repr(py::cast(orig)).cast(); + throw SetPyError(PyExc_ValueError, Twine("Cannot cast type to ") + + DerivedTy::pyClassName + + " (from " + origRepr + ")"); + } + return orig; + } + + static void bind(py::module &m) { + auto cls = ClassTy(m, DerivedTy::pyClassName); + cls.def(py::init(), py::keep_alive<0, 1>()); + cls.def_static("isinstance", [](PyType &otherType) -> bool { + return DerivedTy::isaFunction(otherType); + }); + DerivedTy::bindDerived(cls); + } + + /// Implemented by derived classes to add methods to the Python subclass. 
+ static void bindDerived(ClassTy &m) {} +}; + +class PyIntegerType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAInteger; + static constexpr const char *pyClassName = "IntegerType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get_signless", + [](unsigned width, DefaultingPyMlirContext context) { + MlirType t = mlirIntegerTypeGet(context->get(), width); + return PyIntegerType(context->getRef(), t); + }, + py::arg("width"), py::arg("context") = py::none(), + "Create a signless integer type"); + c.def_static( + "get_signed", + [](unsigned width, DefaultingPyMlirContext context) { + MlirType t = mlirIntegerTypeSignedGet(context->get(), width); + return PyIntegerType(context->getRef(), t); + }, + py::arg("width"), py::arg("context") = py::none(), + "Create a signed integer type"); + c.def_static( + "get_unsigned", + [](unsigned width, DefaultingPyMlirContext context) { + MlirType t = mlirIntegerTypeUnsignedGet(context->get(), width); + return PyIntegerType(context->getRef(), t); + }, + py::arg("width"), py::arg("context") = py::none(), + "Create an unsigned integer type"); + c.def_property_readonly( + "width", + [](PyIntegerType &self) { return mlirIntegerTypeGetWidth(self); }, + "Returns the width of the integer type"); + c.def_property_readonly( + "is_signless", + [](PyIntegerType &self) -> bool { + return mlirIntegerTypeIsSignless(self); + }, + "Returns whether this is a signless integer"); + c.def_property_readonly( + "is_signed", + [](PyIntegerType &self) -> bool { + return mlirIntegerTypeIsSigned(self); + }, + "Returns whether this is a signed integer"); + c.def_property_readonly( + "is_unsigned", + [](PyIntegerType &self) -> bool { + return mlirIntegerTypeIsUnsigned(self); + }, + "Returns whether this is an unsigned integer"); + } +}; + +/// Index Type subclass - IndexType. +class PyIndexType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAIndex; + static constexpr const char *pyClassName = "IndexType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](DefaultingPyMlirContext context) { + MlirType t = mlirIndexTypeGet(context->get()); + return PyIndexType(context->getRef(), t); + }, + py::arg("context") = py::none(), "Create a index type."); + } +}; + +/// Floating Point Type subclass - BF16Type. +class PyBF16Type : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsABF16; + static constexpr const char *pyClassName = "BF16Type"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](DefaultingPyMlirContext context) { + MlirType t = mlirBF16TypeGet(context->get()); + return PyBF16Type(context->getRef(), t); + }, + py::arg("context") = py::none(), "Create a bf16 type."); + } +}; + +/// Floating Point Type subclass - F16Type. +class PyF16Type : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAF16; + static constexpr const char *pyClassName = "F16Type"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](DefaultingPyMlirContext context) { + MlirType t = mlirF16TypeGet(context->get()); + return PyF16Type(context->getRef(), t); + }, + py::arg("context") = py::none(), "Create a f16 type."); + } +}; + +/// Floating Point Type subclass - F32Type. 
+class PyF32Type : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAF32; + static constexpr const char *pyClassName = "F32Type"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](DefaultingPyMlirContext context) { + MlirType t = mlirF32TypeGet(context->get()); + return PyF32Type(context->getRef(), t); + }, + py::arg("context") = py::none(), "Create a f32 type."); + } +}; + +/// Floating Point Type subclass - F64Type. +class PyF64Type : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAF64; + static constexpr const char *pyClassName = "F64Type"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](DefaultingPyMlirContext context) { + MlirType t = mlirF64TypeGet(context->get()); + return PyF64Type(context->getRef(), t); + }, + py::arg("context") = py::none(), "Create a f64 type."); + } +}; + +/// None Type subclass - NoneType. +class PyNoneType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsANone; + static constexpr const char *pyClassName = "NoneType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](DefaultingPyMlirContext context) { + MlirType t = mlirNoneTypeGet(context->get()); + return PyNoneType(context->getRef(), t); + }, + py::arg("context") = py::none(), "Create a none type."); + } +}; + +/// Complex Type subclass - ComplexType. +class PyComplexType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAComplex; + static constexpr const char *pyClassName = "ComplexType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyType &elementType) { + // The element must be a floating point or integer scalar type. 
+ if (mlirTypeIsAIntegerOrFloat(elementType)) { + MlirType t = mlirComplexTypeGet(elementType); + return PyComplexType(elementType.getContext(), t); + } + throw SetPyError( + PyExc_ValueError, + Twine("invalid '") + + py::repr(py::cast(elementType)).cast() + + "' and expected floating point or integer type."); + }, + "Create a complex type"); + c.def_property_readonly( + "element_type", + [](PyComplexType &self) -> PyType { + MlirType t = mlirComplexTypeGetElementType(self); + return PyType(self.getContext(), t); + }, + "Returns element type."); + } +}; + +class PyShapedType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAShaped; + static constexpr const char *pyClassName = "ShapedType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_property_readonly( + "element_type", + [](PyShapedType &self) { + MlirType t = mlirShapedTypeGetElementType(self); + return PyType(self.getContext(), t); + }, + "Returns the element type of the shaped type."); + c.def_property_readonly( + "has_rank", + [](PyShapedType &self) -> bool { return mlirShapedTypeHasRank(self); }, + "Returns whether the given shaped type is ranked."); + c.def_property_readonly( + "rank", + [](PyShapedType &self) { + self.requireHasRank(); + return mlirShapedTypeGetRank(self); + }, + "Returns the rank of the given ranked shaped type."); + c.def_property_readonly( + "has_static_shape", + [](PyShapedType &self) -> bool { + return mlirShapedTypeHasStaticShape(self); + }, + "Returns whether the given shaped type has a static shape."); + c.def( + "is_dynamic_dim", + [](PyShapedType &self, intptr_t dim) -> bool { + self.requireHasRank(); + return mlirShapedTypeIsDynamicDim(self, dim); + }, + "Returns whether the dim-th dimension of the given shaped type is " + "dynamic."); + c.def( + "get_dim_size", + [](PyShapedType &self, intptr_t dim) { + self.requireHasRank(); + return mlirShapedTypeGetDimSize(self, dim); + }, + "Returns the dim-th dimension of the given ranked shaped type."); + c.def_static( + "is_dynamic_size", + [](int64_t size) -> bool { return mlirShapedTypeIsDynamicSize(size); }, + "Returns whether the given dimension size indicates a dynamic " + "dimension."); + c.def( + "is_dynamic_stride_or_offset", + [](PyShapedType &self, int64_t val) -> bool { + self.requireHasRank(); + return mlirShapedTypeIsDynamicStrideOrOffset(val); + }, + "Returns whether the given value is used as a placeholder for dynamic " + "strides and offsets in shaped types."); + } + +private: + void requireHasRank() { + if (!mlirShapedTypeHasRank(*this)) { + throw SetPyError( + PyExc_ValueError, + "calling this method requires that the type has a rank."); + } + } +}; + +/// Vector Type subclass - VectorType. +class PyVectorType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAVector; + static constexpr const char *pyClassName = "VectorType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](std::vector shape, PyType &elementType, + DefaultingPyLocation loc) { + MlirType t = mlirVectorTypeGetChecked(loc, shape.size(), shape.data(), + elementType); + // TODO: Rework error reporting once diagnostic engine is exposed + // in C API. 
+ if (mlirTypeIsNull(t)) { + throw SetPyError( + PyExc_ValueError, + Twine("invalid '") + + py::repr(py::cast(elementType)).cast() + + "' and expected floating point or integer type."); + } + return PyVectorType(elementType.getContext(), t); + }, + py::arg("shape"), py::arg("elementType"), py::arg("loc") = py::none(), + "Create a vector type"); + } +}; + +/// Ranked Tensor Type subclass - RankedTensorType. +class PyRankedTensorType + : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsARankedTensor; + static constexpr const char *pyClassName = "RankedTensorType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](std::vector shape, PyType &elementType, + DefaultingPyLocation loc) { + MlirType t = mlirRankedTensorTypeGetChecked( + loc, shape.size(), shape.data(), elementType); + // TODO: Rework error reporting once diagnostic engine is exposed + // in C API. + if (mlirTypeIsNull(t)) { + throw SetPyError( + PyExc_ValueError, + Twine("invalid '") + + py::repr(py::cast(elementType)).cast() + + "' and expected floating point, integer, vector or " + "complex " + "type."); + } + return PyRankedTensorType(elementType.getContext(), t); + }, + py::arg("shape"), py::arg("element_type"), py::arg("loc") = py::none(), + "Create a ranked tensor type"); + } +}; + +/// Unranked Tensor Type subclass - UnrankedTensorType. +class PyUnrankedTensorType + : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAUnrankedTensor; + static constexpr const char *pyClassName = "UnrankedTensorType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyType &elementType, DefaultingPyLocation loc) { + MlirType t = mlirUnrankedTensorTypeGetChecked(loc, elementType); + // TODO: Rework error reporting once diagnostic engine is exposed + // in C API. + if (mlirTypeIsNull(t)) { + throw SetPyError( + PyExc_ValueError, + Twine("invalid '") + + py::repr(py::cast(elementType)).cast() + + "' and expected floating point, integer, vector or " + "complex " + "type."); + } + return PyUnrankedTensorType(elementType.getContext(), t); + }, + py::arg("element_type"), py::arg("loc") = py::none(), + "Create a unranked tensor type"); + } +}; + +class PyMemRefLayoutMapList; + +/// Ranked MemRef Type subclass - MemRefType. +class PyMemRefType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsARankedTensor; + static constexpr const char *pyClassName = "MemRefType"; + using PyConcreteType::PyConcreteType; + + PyMemRefLayoutMapList getLayout(); + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](std::vector shape, PyType &elementType, + std::vector layout, PyAttribute *memorySpace, + DefaultingPyLocation loc) { + SmallVector maps; + maps.reserve(layout.size()); + for (PyAffineMap &map : layout) + maps.push_back(map); + + MlirAttribute memSpaceAttr = {}; + if (memorySpace) + memSpaceAttr = *memorySpace; + + MlirType t = mlirMemRefTypeGetChecked(loc, elementType, shape.size(), + shape.data(), maps.size(), + maps.data(), memSpaceAttr); + // TODO: Rework error reporting once diagnostic engine is exposed + // in C API. 
+ if (mlirTypeIsNull(t)) { + throw SetPyError( + PyExc_ValueError, + Twine("invalid '") + + py::repr(py::cast(elementType)).cast() + + "' and expected floating point, integer, vector or " + "complex " + "type."); + } + return PyMemRefType(elementType.getContext(), t); + }, + py::arg("shape"), py::arg("element_type"), + py::arg("layout") = py::list(), py::arg("memory_space") = py::none(), + py::arg("loc") = py::none(), "Create a memref type") + .def_property_readonly("layout", &PyMemRefType::getLayout, + "The list of layout maps of the MemRef type.") + .def_property_readonly( + "memory_space", + [](PyMemRefType &self) -> PyAttribute { + MlirAttribute a = mlirMemRefTypeGetMemorySpace(self); + return PyAttribute(self.getContext(), a); + }, + "Returns the memory space of the given MemRef type."); + } +}; + +/// A list of affine layout maps in a memref type. Internally, these are stored +/// as consecutive elements, random access is cheap. Both the type and the maps +/// are owned by the context, no need to worry about lifetime extension. +class PyMemRefLayoutMapList + : public Sliceable { +public: + static constexpr const char *pyClassName = "MemRefLayoutMapList"; + + PyMemRefLayoutMapList(PyMemRefType type, intptr_t startIndex = 0, + intptr_t length = -1, intptr_t step = 1) + : Sliceable(startIndex, + length == -1 ? mlirMemRefTypeGetNumAffineMaps(type) : length, + step), + memref(type) {} + + intptr_t getNumElements() { return mlirMemRefTypeGetNumAffineMaps(memref); } + + PyAffineMap getElement(intptr_t index) { + return PyAffineMap(memref.getContext(), + mlirMemRefTypeGetAffineMap(memref, index)); + } + + PyMemRefLayoutMapList slice(intptr_t startIndex, intptr_t length, + intptr_t step) { + return PyMemRefLayoutMapList(memref, startIndex, length, step); + } + +private: + PyMemRefType memref; +}; + +PyMemRefLayoutMapList PyMemRefType::getLayout() { + return PyMemRefLayoutMapList(*this); +} + +/// Unranked MemRef Type subclass - UnrankedMemRefType. +class PyUnrankedMemRefType + : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAUnrankedMemRef; + static constexpr const char *pyClassName = "UnrankedMemRefType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](PyType &elementType, PyAttribute *memorySpace, + DefaultingPyLocation loc) { + MlirAttribute memSpaceAttr = {}; + if (memorySpace) + memSpaceAttr = *memorySpace; + + MlirType t = + mlirUnrankedMemRefTypeGetChecked(loc, elementType, memSpaceAttr); + // TODO: Rework error reporting once diagnostic engine is exposed + // in C API. + if (mlirTypeIsNull(t)) { + throw SetPyError( + PyExc_ValueError, + Twine("invalid '") + + py::repr(py::cast(elementType)).cast() + + "' and expected floating point, integer, vector or " + "complex " + "type."); + } + return PyUnrankedMemRefType(elementType.getContext(), t); + }, + py::arg("element_type"), py::arg("memory_space"), + py::arg("loc") = py::none(), "Create a unranked memref type") + .def_property_readonly( + "memory_space", + [](PyUnrankedMemRefType &self) -> PyAttribute { + MlirAttribute a = mlirMemRefTypeGetMemorySpace(self); + return PyAttribute(self.getContext(), a); + }, + "Returns the memory space of the given Unranked MemRef type."); + } +}; + +/// Tuple Type subclass - TupleType. 
+class PyTupleType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsATuple; + static constexpr const char *pyClassName = "TupleType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get_tuple", + [](py::list elementList, DefaultingPyMlirContext context) { + intptr_t num = py::len(elementList); + // Mapping py::list to SmallVector. + SmallVector elements; + for (auto element : elementList) + elements.push_back(element.cast()); + MlirType t = mlirTupleTypeGet(context->get(), num, elements.data()); + return PyTupleType(context->getRef(), t); + }, + py::arg("elements"), py::arg("context") = py::none(), + "Create a tuple type"); + c.def( + "get_type", + [](PyTupleType &self, intptr_t pos) -> PyType { + MlirType t = mlirTupleTypeGetType(self, pos); + return PyType(self.getContext(), t); + }, + "Returns the pos-th type in the tuple type."); + c.def_property_readonly( + "num_types", + [](PyTupleType &self) -> intptr_t { + return mlirTupleTypeGetNumTypes(self); + }, + "Returns the number of types contained in a tuple."); + } +}; + +/// Function type. +class PyFunctionType : public PyConcreteType { +public: + static constexpr IsAFunctionTy isaFunction = mlirTypeIsAFunction; + static constexpr const char *pyClassName = "FunctionType"; + using PyConcreteType::PyConcreteType; + + static void bindDerived(ClassTy &c) { + c.def_static( + "get", + [](std::vector inputs, std::vector results, + DefaultingPyMlirContext context) { + SmallVector inputsRaw(inputs.begin(), inputs.end()); + SmallVector resultsRaw(results.begin(), results.end()); + MlirType t = mlirFunctionTypeGet(context->get(), inputsRaw.size(), + inputsRaw.data(), resultsRaw.size(), + resultsRaw.data()); + return PyFunctionType(context->getRef(), t); + }, + py::arg("inputs"), py::arg("results"), py::arg("context") = py::none(), + "Gets a FunctionType from a list of input and result types"); + c.def_property_readonly( + "inputs", + [](PyFunctionType &self) { + MlirType t = self; + auto contextRef = self.getContext(); + py::list types; + for (intptr_t i = 0, e = mlirFunctionTypeGetNumInputs(self); i < e; + ++i) { + types.append(PyType(contextRef, mlirFunctionTypeGetInput(t, i))); + } + return types; + }, + "Returns the list of input types in the FunctionType."); + c.def_property_readonly( + "results", + [](PyFunctionType &self) { + auto contextRef = self.getContext(); + py::list types; + for (intptr_t i = 0, e = mlirFunctionTypeGetNumResults(self); i < e; + ++i) { + types.append( + PyType(contextRef, mlirFunctionTypeGetResult(self, i))); + } + return types; + }, + "Returns the list of result types in the FunctionType."); + } +}; + +} // namespace + +void mlir::python::populateIRTypes(py::module &m) { + PyIntegerType::bind(m); + PyIndexType::bind(m); + PyBF16Type::bind(m); + PyF16Type::bind(m); + PyF32Type::bind(m); + PyF64Type::bind(m); + PyNoneType::bind(m); + PyComplexType::bind(m); + PyShapedType::bind(m); + PyVectorType::bind(m); + PyRankedTensorType::bind(m); + PyUnrankedTensorType::bind(m); + PyMemRefType::bind(m); + PyMemRefLayoutMapList::bind(m); + PyUnrankedMemRefType::bind(m); + PyTupleType::bind(m); + PyFunctionType::bind(m); +} diff --git a/mlir/lib/Bindings/Python/MainModule.cpp b/mlir/lib/Bindings/Python/MainModule.cpp index 9bfe8b09f6db..5fe0401afaeb 100644 --- a/mlir/lib/Bindings/Python/MainModule.cpp +++ b/mlir/lib/Bindings/Python/MainModule.cpp @@ -12,7 +12,7 @@ #include "ExecutionEngine.h" #include "Globals.h" -#include "IRModules.h" 
+#include "IRModule.h" #include "Pass.h" namespace py = pybind11; @@ -211,7 +211,10 @@ PYBIND11_MODULE(_mlir, m) { // Define and populate IR submodule. auto irModule = m.def_submodule("ir", "MLIR IR Bindings"); - populateIRSubmodule(irModule); + populateIRCore(irModule); + populateIRAffine(irModule); + populateIRAttributes(irModule); + populateIRTypes(irModule); // Define and populate PassManager submodule. auto passModule = diff --git a/mlir/lib/Bindings/Python/Pass.cpp b/mlir/lib/Bindings/Python/Pass.cpp index dd57647f0327..0e2f5bafb465 100644 --- a/mlir/lib/Bindings/Python/Pass.cpp +++ b/mlir/lib/Bindings/Python/Pass.cpp @@ -8,7 +8,7 @@ #include "Pass.h" -#include "IRModules.h" +#include "IRModule.h" #include "mlir-c/Bindings/Python/Interop.h" #include "mlir-c/Pass.h" -- GitLab From 0de3d1c81428c2a7a4f9a23a5105aa2243fad778 Mon Sep 17 00:00:00 2001 From: Ella Ma Date: Thu, 18 Mar 2021 21:14:13 -0700 Subject: [PATCH 0200/1000] [llvm] Add assertions for the smart pointers with the possibility to be null in ModuleLazyLoaderCache::operator() Split from D91844. The return value of function `ModuleLazyLoaderCache::operator()` in file llvm/tools/llvm-link/llvm-link.cpp. According to the bug report of my static analyzer, the std::function variable `ModuleLazyLoaderCache::createLazyModule` points to function `loadFile`, which may return `nullptr` when error. And the pointer is dereferenced without a check. Reviewed By: tejohnson Differential Revision: https://reviews.llvm.org/D97258 --- llvm/tools/llvm-link/llvm-link.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp index eed49c438335..b01270de727a 100644 --- a/llvm/tools/llvm-link/llvm-link.cpp +++ b/llvm/tools/llvm-link/llvm-link.cpp @@ -246,8 +246,10 @@ public: Module &ModuleLazyLoaderCache::operator()(const char *argv0, const std::string &Identifier) { auto &Module = ModuleMap[Identifier]; - if (!Module) + if (!Module) { Module = createLazyModule(argv0, Identifier); + assert(Module && "Failed to create lazy module!"); + } return *Module; } } // anonymous namespace -- GitLab From e089b5e9e11a61be0a11378f8df9af806807bddc Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 19 Mar 2021 14:20:26 -0700 Subject: [PATCH 0201/1000] [lldb] Call os_log_fault on lldb_assert Call `os_log_fault` when an lldb assert fails. We piggyback off `LLVM_SUPPORT_XCODE_SIGNPOSTS`, which also depends on `os_log`, to avoid having to introduce another CMake check and corresponding define. This patch also adds a small test using lldb-test that verifies we abort with a "regular" assertion when asserts are enabled. 
Differential revision: https://reviews.llvm.org/D98987
---
 lldb/include/lldb/Utility/LLDBAssert.h |  2 +-
 lldb/source/Utility/LLDBAssert.cpp     | 14 +++++++++++++-
 lldb/test/Shell/Error/assert.test      |  4 ++++
 lldb/tools/lldb-test/lldb-test.cpp     | 12 ++++++++++++
 4 files changed, 30 insertions(+), 2 deletions(-)
 create mode 100644 lldb/test/Shell/Error/assert.test

diff --git a/lldb/include/lldb/Utility/LLDBAssert.h b/lldb/include/lldb/Utility/LLDBAssert.h
index 845af1d4cc2a..471a2f7e824f 100644
--- a/lldb/include/lldb/Utility/LLDBAssert.h
+++ b/lldb/include/lldb/Utility/LLDBAssert.h
@@ -20,6 +20,6 @@ namespace lldb_private {
 void lldb_assert(bool expression, const char *expr_text, const char *func,
                  const char *file, unsigned int line);
-}
+} // namespace lldb_private
 
 #endif // LLDB_UTILITY_LLDBASSERT_H

diff --git a/lldb/source/Utility/LLDBAssert.cpp b/lldb/source/Utility/LLDBAssert.cpp
index 6ae0ee50ef14..532b56b6f59e 100644
--- a/lldb/source/Utility/LLDBAssert.cpp
+++ b/lldb/source/Utility/LLDBAssert.cpp
@@ -7,11 +7,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "lldb/Utility/LLDBAssert.h"
-
+#include "llvm/Config/config.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/raw_ostream.h"
 
+#if LLVM_SUPPORT_XCODE_SIGNPOSTS
+#include <os/log.h>
+#endif
+
 using namespace llvm;
 using namespace lldb_private;
 
@@ -24,6 +28,14 @@ void lldb_private::lldb_assert(bool expression, const char *expr_text,
   // If asserts are enabled abort here.
   assert(false && "lldb_assert failed");
 
+#if LLVM_SUPPORT_XCODE_SIGNPOSTS
+  if (__builtin_available(macos 10.12, iOS 10, tvOS 10, watchOS 3, *)) {
+    os_log_fault(OS_LOG_DEFAULT,
+                 "Assertion failed: (%s), function %s, file %s, line %u\n",
+                 expr_text, func, file, line);
+  }
+#endif
+
   // In a release configuration it will print a warning and encourage the user
   // to file a bug report, similar to LLVM's crash handler, and then return
   // execution.
diff --git a/lldb/test/Shell/Error/assert.test b/lldb/test/Shell/Error/assert.test
new file mode 100644
index 000000000000..109795f6e8de
--- /dev/null
+++ b/lldb/test/Shell/Error/assert.test
@@ -0,0 +1,4 @@
+# REQUIRES: asserts
+# RUN: not --crash lldb-test assert > %t.error 2>&1
+# RUN: cat %t.error | FileCheck %s
+# CHECK: Assertion failed: (false && "lldb_assert failed")
diff --git a/lldb/tools/lldb-test/lldb-test.cpp b/lldb/tools/lldb-test/lldb-test.cpp
index 842a951f384b..1109a6bb6558 100644
--- a/lldb/tools/lldb-test/lldb-test.cpp
+++ b/lldb/tools/lldb-test/lldb-test.cpp
@@ -29,6 +29,7 @@
 #include "lldb/Target/Process.h"
 #include "lldb/Target/Target.h"
 #include "lldb/Utility/DataExtractor.h"
+#include "lldb/Utility/LLDBAssert.h"
 #include "lldb/Utility/State.h"
 #include "lldb/Utility/StreamString.h"
 
@@ -57,6 +58,7 @@ cl::SubCommand ObjectFileSubcommand("object-file",
                                     "Display LLDB object file information");
 cl::SubCommand SymbolsSubcommand("symbols", "Dump symbols for an object file");
 cl::SubCommand IRMemoryMapSubcommand("ir-memory-map", "Test IRMemoryMap");
+cl::SubCommand AssertSubcommand("assert", "Test assert handling");
 
 cl::opt<std::string> Log("log", cl::desc("Path to a log file"), cl::init(""),
                          cl::sub(BreakpointSubcommand),
@@ -236,6 +238,9 @@ bool evalFree(StringRef Line, IRMemoryMapTestState &State);
 int evaluateMemoryMapCommands(Debugger &Dbg);
 } // namespace irmemorymap
 
+namespace assert {
+int lldb_assert(Debugger &Dbg);
+} // namespace assert
 } // namespace opts
 
 std::vector<CompilerContext> parseCompilerContext() {
@@ -1077,6 +1082,11 @@ int opts::irmemorymap::evaluateMemoryMapCommands(Debugger &Dbg) {
   return 0;
 }
 
+int opts::assert::lldb_assert(Debugger &Dbg) {
+  lldbassert(false && "lldb-test assert");
+  return 1;
+}
+
 int main(int argc, const char *argv[]) {
   StringRef ToolName = argv[0];
   sys::PrintStackTraceOnErrorSignal(ToolName);
@@ -1120,6 +1130,8 @@ int main(int argc, const char *argv[]) {
     return opts::symbols::dumpSymbols(*Dbg);
   if (opts::IRMemoryMapSubcommand)
     return opts::irmemorymap::evaluateMemoryMapCommands(*Dbg);
+  if (opts::AssertSubcommand)
+    return opts::assert::lldb_assert(*Dbg);
 
   WithColor::error() << "No command specified.\n";
   return 1;
--
GitLab


From a1ab5627f012aee9d204cea67d79dc1f172b46f8 Mon Sep 17 00:00:00 2001
From: Arthur Eubanks
Date: Fri, 19 Mar 2021 14:31:08 -0700
Subject: [PATCH 0202/1000] Revert "[NewPM] Verify LoopAnalysisResults after a
 loop pass"

This reverts commit 94c269baf58330a5e303a4f86f64681f2f7a858b.

Still causes too large a compile-time regression in normal debug builds.
Will put it under expensive checks instead.

---
 llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index bea938a7a9cc..60a9602096bb 100644
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -14,7 +14,6 @@
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/TimeProfiler.h"
 
 using namespace llvm;
 
@@ -292,15 +291,8 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
     else
       PI.runAfterPass(*Pass, *L, PassPA);
 
-#ifndef NDEBUG
-    // LoopAnalysisResults should always be valid.
-    // Note that we don't LAR.SE.verify() because that can change observed SE
-    // queries. See PR44815.
- LAR.DT.verify(); - LAR.LI.verify(LAR.DT); - if (LAR.MSSA && VerifyMemorySSA) - LAR.MSSA->verifyMemorySSA(); -#endif + // FIXME: We should verify the set of analyses relevant to Loop passes + // are preserved. // If the loop hasn't been deleted, we need to handle invalidation here. if (!Updater.skipCurrentLoop()) -- GitLab From cdac60107db9f04b27077379259678adf6f03617 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 19 Mar 2021 14:31:22 -0700 Subject: [PATCH 0203/1000] [lldb] Update assert.test to be less strict Be less strict when checking for the assert substring. --- lldb/test/Shell/Error/assert.test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/test/Shell/Error/assert.test b/lldb/test/Shell/Error/assert.test index 109795f6e8de..92ccd134b92d 100644 --- a/lldb/test/Shell/Error/assert.test +++ b/lldb/test/Shell/Error/assert.test @@ -1,4 +1,4 @@ # REQUIRES: asserts # RUN: not --crash lldb-test assert > %t.error 2>&1 # RUN: cat %t.error | FileCheck %s -# CHECK: Assertion failed: (false && "lldb_assert failed") +# CHECK: "lldb_assert failed" -- GitLab From 948be862d6dde3ae5ebf3983f78ec2eee5422ed1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 19 Mar 2021 14:35:22 -0700 Subject: [PATCH 0204/1000] [llvm-readobj] Remove legacy GNU_PROPERTY_X86_ISA_1_{NEEDED,USED} and dump new GNU_PROPERTY_X86_ISA_1_{NEEDED,USED} https://sourceware.org/bugzilla/show_bug.cgi?id=26703 deprecated the previous GNU_PROPERTY_X86_ISA_1_{CMOV,SSE,*} values (renamed to `COMPAT`) and added new values. Since the legacy values are not used by compilers, having dumping support in llvm-readobj is unnecessary. So just drop the legacy feature. The new values are used by GCC 11 (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=97250) `-march=x86-64-v[234]` to indicate the micro-architecture ISA levels. Differential Revision: https://reviews.llvm.org/D98818 --- llvm/include/llvm/BinaryFormat/ELF.h | 42 ++++--------- .../llvm-readobj/ELF/note-gnu-property.s | 24 ++++---- llvm/tools/llvm-readobj/ELFDumper.cpp | 61 ++++++------------- 3 files changed, 45 insertions(+), 82 deletions(-) diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 30209a59ca89..e4144370ebf3 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -1470,10 +1470,14 @@ enum : unsigned { GNU_PROPERTY_NO_COPY_ON_PROTECTED = 2, GNU_PROPERTY_AARCH64_FEATURE_1_AND = 0xc0000000, GNU_PROPERTY_X86_FEATURE_1_AND = 0xc0000002, - GNU_PROPERTY_X86_ISA_1_NEEDED = 0xc0008000, - GNU_PROPERTY_X86_FEATURE_2_NEEDED = 0xc0008001, - GNU_PROPERTY_X86_ISA_1_USED = 0xc0010000, - GNU_PROPERTY_X86_FEATURE_2_USED = 0xc0010001, + + GNU_PROPERTY_X86_UINT32_OR_LO = 0xc0008000, + GNU_PROPERTY_X86_FEATURE_2_NEEDED = GNU_PROPERTY_X86_UINT32_OR_LO + 1, + GNU_PROPERTY_X86_ISA_1_NEEDED = GNU_PROPERTY_X86_UINT32_OR_LO + 2, + + GNU_PROPERTY_X86_UINT32_OR_AND_LO = 0xc0010000, + GNU_PROPERTY_X86_FEATURE_2_USED = GNU_PROPERTY_X86_UINT32_OR_AND_LO + 1, + GNU_PROPERTY_X86_ISA_1_USED = GNU_PROPERTY_X86_UINT32_OR_AND_LO + 2, }; // aarch64 processor feature bits. 
@@ -1487,31 +1491,6 @@ enum : unsigned { GNU_PROPERTY_X86_FEATURE_1_IBT = 1 << 0, GNU_PROPERTY_X86_FEATURE_1_SHSTK = 1 << 1, - GNU_PROPERTY_X86_ISA_1_CMOV = 1 << 0, - GNU_PROPERTY_X86_ISA_1_SSE = 1 << 1, - GNU_PROPERTY_X86_ISA_1_SSE2 = 1 << 2, - GNU_PROPERTY_X86_ISA_1_SSE3 = 1 << 3, - GNU_PROPERTY_X86_ISA_1_SSSE3 = 1 << 4, - GNU_PROPERTY_X86_ISA_1_SSE4_1 = 1 << 5, - GNU_PROPERTY_X86_ISA_1_SSE4_2 = 1 << 6, - GNU_PROPERTY_X86_ISA_1_AVX = 1 << 7, - GNU_PROPERTY_X86_ISA_1_AVX2 = 1 << 8, - GNU_PROPERTY_X86_ISA_1_FMA = 1 << 9, - GNU_PROPERTY_X86_ISA_1_AVX512F = 1 << 10, - GNU_PROPERTY_X86_ISA_1_AVX512CD = 1 << 11, - GNU_PROPERTY_X86_ISA_1_AVX512ER = 1 << 12, - GNU_PROPERTY_X86_ISA_1_AVX512PF = 1 << 13, - GNU_PROPERTY_X86_ISA_1_AVX512VL = 1 << 14, - GNU_PROPERTY_X86_ISA_1_AVX512DQ = 1 << 15, - GNU_PROPERTY_X86_ISA_1_AVX512BW = 1 << 16, - GNU_PROPERTY_X86_ISA_1_AVX512_4FMAPS = 1 << 17, - GNU_PROPERTY_X86_ISA_1_AVX512_4VNNIW = 1 << 18, - GNU_PROPERTY_X86_ISA_1_AVX512_BITALG = 1 << 19, - GNU_PROPERTY_X86_ISA_1_AVX512_IFMA = 1 << 20, - GNU_PROPERTY_X86_ISA_1_AVX512_VBMI = 1 << 21, - GNU_PROPERTY_X86_ISA_1_AVX512_VBMI2 = 1 << 22, - GNU_PROPERTY_X86_ISA_1_AVX512_VNNI = 1 << 23, - GNU_PROPERTY_X86_FEATURE_2_X86 = 1 << 0, GNU_PROPERTY_X86_FEATURE_2_X87 = 1 << 1, GNU_PROPERTY_X86_FEATURE_2_MMX = 1 << 2, @@ -1522,6 +1501,11 @@ enum : unsigned { GNU_PROPERTY_X86_FEATURE_2_XSAVE = 1 << 7, GNU_PROPERTY_X86_FEATURE_2_XSAVEOPT = 1 << 8, GNU_PROPERTY_X86_FEATURE_2_XSAVEC = 1 << 9, + + GNU_PROPERTY_X86_ISA_1_BASELINE = 1 << 0, + GNU_PROPERTY_X86_ISA_1_V2 = 1 << 1, + GNU_PROPERTY_X86_ISA_1_V3 = 1 << 2, + GNU_PROPERTY_X86_ISA_1_V4 = 1 << 3, }; // FreeBSD note types. diff --git a/llvm/test/tools/llvm-readobj/ELF/note-gnu-property.s b/llvm/test/tools/llvm-readobj/ELF/note-gnu-property.s index 8c3a40ad2f69..2d0d00f60639 100644 --- a/llvm/test/tools/llvm-readobj/ELF/note-gnu-property.s +++ b/llvm/test/tools/llvm-readobj/ELF/note-gnu-property.s @@ -12,10 +12,10 @@ // GNU-NEXT: x86 feature: SHSTK // GNU-NEXT: x86 feature: IBT, SHSTK // GNU-NEXT: x86 feature: -// GNU-NEXT: x86 ISA needed: CMOV, SSE, SSE2, SSE3, SSSE3, SSE4_1, SSE4_2, AVX, AVX2, FMA, AVX512F, AVX512CD -// GNU-NEXT: x86 ISA used: AVX512ER, AVX512PF, AVX512VL, AVX512DQ, AVX512BW, AVX512_4FMAPS, AVX512_4VNNIW, AVX512_BITALG, AVX512_IFMA, AVX512_VBMI, AVX512_VBMI2, AVX512_VNNI // GNU-NEXT: x86 feature needed: x86, x87, MMX, XMM, YMM // GNU-NEXT: x86 feature used: ZMM, FXSR, XSAVE, XSAVEOPT, XSAVEC +// GNU-NEXT: x86 ISA needed: x86-64-baseline, x86-64-v2, x86-64-v3, x86-64-v4 +// GNU-NEXT: x86 ISA used: x86-64-baseline, x86-64-v2, x86-64-v3, x86-64-v4 // GNU-NEXT: // GNU-NEXT: stack size: // GNU-NEXT: stack size: @@ -40,10 +40,10 @@ // LLVM-NEXT: x86 feature: SHSTK // LLVM-NEXT: x86 feature: IBT, SHSTK // LLVM-NEXT: x86 feature: -// LLVM-NEXT: x86 ISA needed: CMOV, SSE, SSE2, SSE3, SSSE3, SSE4_1, SSE4_2, AVX, AVX2, FMA, AVX512F, AVX512CD -// LLVM-NEXT: x86 ISA used: AVX512ER, AVX512PF, AVX512VL, AVX512DQ, AVX512BW, AVX512_4FMAPS, AVX512_4VNNIW, AVX512_BITALG, AVX512_IFMA, AVX512_VBMI, AVX512_VBMI2, AVX512_VNNI // LLVM-NEXT: x86 feature needed: x86, x87, MMX, XMM, YMM // LLVM-NEXT: x86 feature used: ZMM, FXSR, XSAVE, XSAVEOPT, XSAVEC +// LLVM-NEXT: x86 ISA needed: x86-64-baseline, x86-64-v2, x86-64-v3, x86-64-v4 +// LLVM-NEXT: x86 ISA used: x86-64-baseline, x86-64-v2, x86-64-v3, x86-64-v4 // LLVM-NEXT: // LLVM-NEXT: stack size: // LLVM-NEXT: stack size: @@ -96,24 +96,24 @@ begin: .long 0 /* Empty flags, not an error */ .p2align 3 /* Align to 8 byte for 64 bit */ 
- .long 0xc0008000 /* Type: GNU_PROPERTY_X86_ISA_1_NEEDED */ + .long 0xc0008001 /* Type: GNU_PROPERTY_X86_FEATURE_2_NEEDED */ .long 4 /* Data size */ - .long 0x00000fff /* CMOV, ... */ + .long 0x0000001f /* X86, ... */ .p2align 3 /* Align to 8 byte for 64 bit */ - .long 0xc0010000 /* Type: GNU_PROPERTY_X86_ISA_1_USED */ + .long 0xc0010001 /* Type: GNU_PROPERTY_X86_FEATURE_2_USED */ .long 4 /* Data size */ - .long 0x00fff000 /* AVX512_ER, ... */ + .long 0x000003e0 /* ZMM, ... */ .p2align 3 /* Align to 8 byte for 64 bit */ - .long 0xc0008001 /* Type: GNU_PROPERTY_X86_FEATURE_2_NEEDED */ + .long 0xc0008002 /* Type: GNU_PROPERTY_X86_ISA_1_NEEDED */ .long 4 /* Data size */ - .long 0x0000001f /* X86, ... */ + .long 0x0000000f /* x86-64-baseline, ... */ .p2align 3 /* Align to 8 byte for 64 bit */ - .long 0xc0010001 /* Type: GNU_PROPERTY_X86_FEATURE_2_USED */ + .long 0xc0010002 /* Type: GNU_PROPERTY_X86_ISA_1_USED */ .long 4 /* Data size */ - .long 0x000003e0 /* ZMM, ... */ + .long 0x0000000f /* x86-64-baseline, ... */ .p2align 3 /* Align to 8 byte for 64 bit */ /* All notes below are broken. Test we are able to report them. */ diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 2535ae7830ae..c4938ad7b5f1 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -4736,47 +4736,6 @@ static std::string getGNUProperty(uint32_t Type, uint32_t DataSize, if (PrData) OS << format("", PrData); return OS.str(); - case GNU_PROPERTY_X86_ISA_1_NEEDED: - case GNU_PROPERTY_X86_ISA_1_USED: - OS << "x86 ISA " - << (Type == GNU_PROPERTY_X86_ISA_1_NEEDED ? "needed: " : "used: "); - if (DataSize != 4) { - OS << format("", DataSize); - return OS.str(); - } - PrData = support::endian::read32(Data.data()); - if (PrData == 0) { - OS << ""; - return OS.str(); - } - DumpBit(GNU_PROPERTY_X86_ISA_1_CMOV, "CMOV"); - DumpBit(GNU_PROPERTY_X86_ISA_1_SSE, "SSE"); - DumpBit(GNU_PROPERTY_X86_ISA_1_SSE2, "SSE2"); - DumpBit(GNU_PROPERTY_X86_ISA_1_SSE3, "SSE3"); - DumpBit(GNU_PROPERTY_X86_ISA_1_SSSE3, "SSSE3"); - DumpBit(GNU_PROPERTY_X86_ISA_1_SSE4_1, "SSE4_1"); - DumpBit(GNU_PROPERTY_X86_ISA_1_SSE4_2, "SSE4_2"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX, "AVX"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX2, "AVX2"); - DumpBit(GNU_PROPERTY_X86_ISA_1_FMA, "FMA"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512F, "AVX512F"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512CD, "AVX512CD"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512ER, "AVX512ER"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512PF, "AVX512PF"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512VL, "AVX512VL"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512DQ, "AVX512DQ"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512BW, "AVX512BW"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512_4FMAPS, "AVX512_4FMAPS"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512_4VNNIW, "AVX512_4VNNIW"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512_BITALG, "AVX512_BITALG"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512_IFMA, "AVX512_IFMA"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512_VBMI, "AVX512_VBMI"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512_VBMI2, "AVX512_VBMI2"); - DumpBit(GNU_PROPERTY_X86_ISA_1_AVX512_VNNI, "AVX512_VNNI"); - if (PrData) - OS << format("", PrData); - return OS.str(); - break; case GNU_PROPERTY_X86_FEATURE_2_NEEDED: case GNU_PROPERTY_X86_FEATURE_2_USED: OS << "x86 feature " @@ -4803,6 +4762,26 @@ static std::string getGNUProperty(uint32_t Type, uint32_t DataSize, if (PrData) OS << format("", PrData); return OS.str(); + case GNU_PROPERTY_X86_ISA_1_NEEDED: + case 
GNU_PROPERTY_X86_ISA_1_USED:
+    OS << "x86 ISA "
+       << (Type == GNU_PROPERTY_X86_ISA_1_NEEDED ? "needed: " : "used: ");
+    if (DataSize != 4) {
+      OS << format("<corrupt length: 0x%x>", DataSize);
+      return OS.str();
+    }
+    PrData = support::endian::read32<ELFT::TargetEndianness>(Data.data());
+    if (PrData == 0) {
+      OS << "<None>";
+      return OS.str();
+    }
+    DumpBit(GNU_PROPERTY_X86_ISA_1_BASELINE, "x86-64-baseline");
+    DumpBit(GNU_PROPERTY_X86_ISA_1_V2, "x86-64-v2");
+    DumpBit(GNU_PROPERTY_X86_ISA_1_V3, "x86-64-v3");
+    DumpBit(GNU_PROPERTY_X86_ISA_1_V4, "x86-64-v4");
+    if (PrData)
+      OS << format("<unknown flags: 0x%x>", PrData);
+    return OS.str();
   }
 }
--
GitLab


From cde203e0f9438a4bba3b9b50bd437444852b9909 Mon Sep 17 00:00:00 2001
From: River Riddle
Date: Fri, 19 Mar 2021 14:20:14 -0700
Subject: [PATCH 0205/1000] [mlir][Pass] Coalesce dynamic pass pipelines before
 running

This was missed when dynamic pass pipelines were added, and is necessary
for maximizing the performance/parallelism potential of the pass
pipeline.

---
 mlir/lib/Pass/Pass.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp
index 8507fd6d3451..7e9e3b569962 100644
--- a/mlir/lib/Pass/Pass.cpp
+++ b/mlir/lib/Pass/Pass.cpp
@@ -381,6 +381,10 @@ LogicalResult OpToOpPassAdaptor::run(Pass *pass, Operation *op,
            "nested under the current operation the pass is processing";
   assert(pipeline.getOpName() == root->getName().getStringRef());
 
+  // Before running, make sure to coalesce any adjacent pass adaptors in the
+  // pipeline.
+  pipeline.getImpl().coalesceAdjacentAdaptorPasses();
+
   // Initialize the user provided pipeline and execute the pipeline.
   if (failed(pipeline.initialize(root->getContext(), parentInitGeneration)))
     return failure();
--
GitLab


From 4773dd5ba9993e127586a5e5b1993d431a47372c Mon Sep 17 00:00:00 2001
From: Jessica Paquette
Date: Thu, 11 Mar 2021 15:36:01 -0800
Subject: [PATCH 0206/1000] [GlobalISel] Add G_SBFX + G_UBFX (bitfield
 extraction opcodes)

There is a bunch of similar bitfield extraction code throughout
*ISelDAGToDAG. E.g., ARMISelDAGToDAG, AArch64ISelDAGToDAG, and
AMDGPUISelDAGToDAG all contain code that matches a bitfield extract
from an and + right shift.

Rather than duplicating code in the same way, this adds two opcodes:

- G_UBFX (unsigned bitfield extract)
- G_SBFX (signed bitfield extract)

They work like this:

```
%x = G_UBFX %y, %lsb, %width
```

Where `lsb` and `width` are

- The least-significant bit of the extraction
- The width of the extraction

This will extract `width` bits from `%y`, starting at `lsb`. G_UBFX
zero-extends the result, while G_SBFX sign-extends the result.

This should allow us to use the combiner to match the bitfield extraction
patterns rather than duplicating pattern-matching code in each target.
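For illustration, the MachineIRBuilder helpers added below make the new
opcodes easy to emit; a minimal sketch (register setup elided, function name
assumed, signatures as added in this patch):

```
// Build extractions of bits [1, 6) from Src using the new helpers.
static void buildExtracts(MachineIRBuilder &B, Register Src) {
  LLT S32 = LLT::scalar(32);
  auto Lsb = B.buildConstant(S32, 1);   // least-significant bit of the field
  auto Width = B.buildConstant(S32, 5); // field width in bits
  B.buildUbfx(S32, Src, Lsb, Width);    // zero-extends the extracted field
  B.buildSbfx(S32, Src, Lsb, Width);    // sign-extends the extracted field
}
```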
Differential Revision: https://reviews.llvm.org/D98464 --- llvm/docs/GlobalISel/GenericOpcode.rst | 33 +++++++++++++++++++ .../CodeGen/GlobalISel/MachineIRBuilder.h | 12 +++++++ llvm/include/llvm/Support/TargetOpcodes.def | 5 ++- llvm/include/llvm/Target/GenericOpcodes.td | 18 ++++++++++ llvm/lib/CodeGen/MachineVerifier.cpp | 11 +++++++ .../test/MachineVerifier/test_g_ubfx_sbfx.mir | 15 +++++++++ .../GlobalISel/MachineIRBuilderTest.cpp | 22 +++++++++++++ 7 files changed, 115 insertions(+), 1 deletion(-) create mode 100644 llvm/test/MachineVerifier/test_g_ubfx_sbfx.mir diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index 15a28e5afd17..e37ec24f02b5 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -233,6 +233,39 @@ Reverse the order of the bits in a scalar. %1:_(s32) = G_BITREVERSE %0:_(s32) +G_SBFX, G_UBFX +^^^^^^^^^^^^^^ + +Extract a range of bits from a register. + +The source operands are registers as follows: + +- Source +- The least-significant bit for the extraction +- The width of the extraction + +G_SBFX sign-extends the result, while G_UBFX zero-extends the result. + +.. code-block:: none + + ; Extract 5 bits starting at bit 1 from %x and store them in %a. + ; Sign-extend the result. + ; + ; Example: + ; %x = 0...0000[10110]1 ---> %a = 1...111111[10110] + %lsb_one = G_CONSTANT i32 1 + %width_five = G_CONSTANT i32 5 + %a:_(s32) = G_SBFX %x, %lsb_one, %width_five + + ; Extract 3 bits starting at bit 2 from %x and store them in %b. Zero-extend + ; the result. + ; + ; Example: + ; %x = 1...11111[100]11 ---> %b = 0...00000[100] + %lsb_two = G_CONSTANT i32 2 + %width_three = G_CONSTANT i32 3 + %b:_(s32) = G_UBFX %x, %lsb_two, %width_three + Integer Operations ------------------- diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h index c916ff14aa14..2812890a344d 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h @@ -1831,6 +1831,18 @@ public: DstMMO, SrcMMO); } + /// Build and insert \p Dst = G_SBFX \p Src, \p LSB, \p Width. + MachineInstrBuilder buildSbfx(const DstOp &Dst, const SrcOp &Src, + const SrcOp &LSB, const SrcOp &Width) { + return buildInstr(TargetOpcode::G_SBFX, {Dst}, {Src, LSB, Width}); + } + + /// Build and insert \p Dst = G_UBFX \p Src, \p LSB, \p Width. + MachineInstrBuilder buildUbfx(const DstOp &Dst, const SrcOp &Src, + const SrcOp &LSB, const SrcOp &Width) { + return buildInstr(TargetOpcode::G_UBFX, {Dst}, {Src, LSB, Width}); + } + virtual MachineInstrBuilder buildInstr(unsigned Opc, ArrayRef DstOps, ArrayRef SrcOps, Optional Flags = None); diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 2fc1de2d8551..3d450d5adc67 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -749,10 +749,13 @@ HANDLE_TARGET_OPCODE(G_VECREDUCE_SMIN) HANDLE_TARGET_OPCODE(G_VECREDUCE_UMAX) HANDLE_TARGET_OPCODE(G_VECREDUCE_UMIN) +HANDLE_TARGET_OPCODE(G_SBFX) +HANDLE_TARGET_OPCODE(G_UBFX) + /// Marker for the end of the generic opcode. /// This is used to check if an opcode is in the range of the /// generic opcodes. -HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_VECREDUCE_UMIN) +HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_UBFX) /// BUILTIN_OP_END - This must be the last enum value in this list. 
/// The target-specific post-isel opcode values start here. diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 1732c9577a35..c8b72ee0df51 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1354,6 +1354,24 @@ def G_MEMSET : GenericInstruction { let mayStore = true; } +//------------------------------------------------------------------------------ +// Bitfield extraction. +//------------------------------------------------------------------------------ + +// Generic signed bitfield extraction. +def G_SBFX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type0:$lsb, type0:$width); + let hasSideEffects = false; +} + +// Generic unsigned bitfield extraction. +def G_UBFX : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src, type0:$lsb, type0:$width); + let hasSideEffects = false; +} + //------------------------------------------------------------------------------ // Optimization hints //------------------------------------------------------------------------------ diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index 57eb9443a8dd..af8b84e8aaf2 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -1566,6 +1566,17 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) { report("Vector reduction requires vector source=", MI); break; } + + case TargetOpcode::G_SBFX: + case TargetOpcode::G_UBFX: { + LLT DstTy = MRI->getType(MI->getOperand(0).getReg()); + if (DstTy.isVector()) { + report("Bitfield extraction is not supported on vectors", MI); + break; + } + break; + } + default: break; } diff --git a/llvm/test/MachineVerifier/test_g_ubfx_sbfx.mir b/llvm/test/MachineVerifier/test_g_ubfx_sbfx.mir new file mode 100644 index 000000000000..dbc6d52d7cfa --- /dev/null +++ b/llvm/test/MachineVerifier/test_g_ubfx_sbfx.mir @@ -0,0 +1,15 @@ +# RUN: not --crash llc -verify-machineinstrs -run-pass none -o /dev/null %s 2>&1 | FileCheck %s +# REQUIRES: aarch64-registered-target + +name: test +body: | + bb.0: + %v1:_(<2 x s64>) = G_IMPLICIT_DEF + %v2:_(<2 x s64>) = G_IMPLICIT_DEF + %v3:_(<2 x s64>) = G_IMPLICIT_DEF + + ; CHECK: *** Bad machine code: Bitfield extraction is not supported on vectors *** + %ubfx_vector:_(<2 x s64>) = G_UBFX %v1, %v2, %v3 + ; CHECK: *** Bad machine code: Bitfield extraction is not supported on vectors *** + %sbfx_vector:_(<2 x s64>) = G_SBFX %v1, %v2, %v3 +... 
diff --git a/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp b/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp index 8128c3390aa0..daad2f78632b 100644 --- a/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp @@ -398,3 +398,25 @@ TEST_F(AArch64GISelMITest, BuildAddoSubo) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } + +TEST_F(AArch64GISelMITest, BuildBitfieldExtract) { + setUp(); + if (!TM) + return; + LLT S64 = LLT::scalar(64); + SmallVector<Register, 4> Copies; + collectCopies(Copies, MF); + + auto Ubfx = B.buildUbfx(S64, Copies[0], Copies[1], Copies[2]); + B.buildSbfx(S64, Ubfx, Copies[0], Copies[2]); + + const auto *CheckStr = R"( + ; CHECK: [[COPY0:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK: [[UBFX:%[0-9]+]]:_(s64) = G_UBFX [[COPY0]]:_, [[COPY1]]:_, [[COPY2]]:_ + ; CHECK: [[SBFX:%[0-9]+]]:_(s64) = G_SBFX [[UBFX]]:_, [[COPY0]]:_, [[COPY2]]:_ + )"; + + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} -- GitLab From a17394dc88cccc669b8c16f8ba8f40f546dafc1b Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 19 Mar 2021 14:30:48 -0700 Subject: [PATCH 0207/1000] [NewPM] Verify LoopAnalysisResults after a loop pass All loop passes should preserve all analyses in LoopAnalysisResults. Add checks for those analyses when checking is enabled (which it is by default when expensive checks are on). Note that due to PR44815, we don't check LAR's ScalarEvolution. Apparently calling SE.verify() can change its results. This is a reland of https://reviews.llvm.org/D98820 which was reverted due to unacceptably large compile time regressions in normal debug builds. --- llvm/lib/Transforms/Scalar/LoopPassManager.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 60a9602096bb..0bb3ec46703c 100644 --- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -14,6 +14,7 @@ #include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/TimeProfiler.h" using namespace llvm; @@ -291,8 +292,17 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F, else PI.runAfterPass(*Pass, *L, PassPA); - // FIXME: We should verify the set of analyses relevant to Loop passes - // are preserved. +#ifndef NDEBUG + // LoopAnalysisResults should always be valid. + // Note that we don't call LAR.SE.verify() because that can change observed + // SE queries. See PR44815. + if (VerifyDomInfo) + LAR.DT.verify(); + if (VerifyLoopInfo) + LAR.LI.verify(LAR.DT); + if (LAR.MSSA && VerifyMemorySSA) + LAR.MSSA->verifyMemorySSA(); +#endif // If the loop hasn't been deleted, we need to handle invalidation here. if (!Updater.skipCurrentLoop()) -- GitLab From cb8c1ee269da72eb6e2c18800cd8ab0a74050785 Mon Sep 17 00:00:00 2001 From: Vedant Kumar Date: Thu, 18 Mar 2021 11:12:17 -0700 Subject: [PATCH 0208/1000] [lldb/PlatformPOSIX] Change LoadImage default to RTLD_LAZY In general, it seems like the debugger should allow programs to load & run with libraries as far as possible, instead of defaulting to being super-picky about unavailable symbols.
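As a rough sketch of what this means in practice (plain POSIX C++, with a hypothetical libexample.so standing in for the real library):

```cpp
#include <cstdio>
#include <dlfcn.h>

int main() {
  // RTLD_NOW forces resolution of every undefined symbol at dlopen() time,
  // so one unresolvable reference makes the whole load fail.
  if (!dlopen("libexample.so", RTLD_NOW))
    std::printf("eager load failed: %s\n", dlerror());

  // RTLD_LAZY defers binding of function symbols until their first call,
  // so the library can load and run even if some referenced functions are
  // missing, as long as they are never actually called.
  if (dlopen("libexample.so", RTLD_LAZY))
    std::printf("lazy load succeeded\n");
  return 0;
}
```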
This is critical on macOS/Darwin, as libswiftCore.dylib may 1) export a version symbol using @available markup and then 2) expect that other exported APIs are only dynamically used once the version symbol is checked. We can't open a version of the library built with a bleeding-edge SDK on an older OS without RTLD_LAZY (or pervasive/expensive @available markup added to dyld APIs). See: https://lists.llvm.org/pipermail/lldb-dev/2021-March/016796.html Differential Revision: https://reviews.llvm.org/D98879 --- .../Plugins/Platform/POSIX/PlatformPOSIX.cpp | 16 +++++- .../API/functionalities/load_lazy/Makefile | 17 ++++++ .../load_lazy/TestLoadUsingLazyBind.py | 54 +++++++++++++++++++ .../API/functionalities/load_lazy/categories | 1 + .../API/functionalities/load_lazy/main.cpp | 3 ++ lldb/test/API/functionalities/load_lazy/t1.c | 3 ++ .../test/API/functionalities/load_lazy/t2_0.c | 1 + .../test/API/functionalities/load_lazy/t2_1.c | 0 8 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 lldb/test/API/functionalities/load_lazy/Makefile create mode 100644 lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py create mode 100644 lldb/test/API/functionalities/load_lazy/categories create mode 100644 lldb/test/API/functionalities/load_lazy/main.cpp create mode 100644 lldb/test/API/functionalities/load_lazy/t1.c create mode 100644 lldb/test/API/functionalities/load_lazy/t2_0.c create mode 100644 lldb/test/API/functionalities/load_lazy/t2_1.c diff --git a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp index c8a006001fcb..3e5f1451ef5f 100644 --- a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp +++ b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp @@ -578,7 +578,19 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, // __lldb_dlopen_result for consistency. The wrapper returns a void * but // doesn't use it because UtilityFunctions don't work with void returns at // present. + // + // Use lazy binding so as to not make dlopen()'s success conditional on + // forcing every symbol in the library. + // + // In general, the debugger should allow programs to load & run with + // libraries as far as they can, instead of defaulting to being super-picky + // about unavailable symbols. + // + // The value "1" appears to imply lazy binding (RTLD_LAZY) on both Darwin + // and other POSIX OSes.
static const char *dlopen_wrapper_code = R"( + const int RTLD_LAZY = 1; + struct __lldb_dlopen_result { void *image_ptr; const char *error_str; @@ -595,7 +607,7 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, { // This is the case where the name is the full path: if (!path_strings) { - result_ptr->image_ptr = dlopen(name, 2); + result_ptr->image_ptr = dlopen(name, RTLD_LAZY); if (result_ptr->image_ptr) result_ptr->error_str = nullptr; return nullptr; @@ -609,7 +621,7 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, buffer[path_len] = '/'; char *target_ptr = buffer+path_len+1; memcpy((void *) target_ptr, (void *) name, name_len + 1); - result_ptr->image_ptr = dlopen(buffer, 2); + result_ptr->image_ptr = dlopen(buffer, RTLD_LAZY); if (result_ptr->image_ptr) { result_ptr->error_str = nullptr; break; diff --git a/lldb/test/API/functionalities/load_lazy/Makefile b/lldb/test/API/functionalities/load_lazy/Makefile new file mode 100644 index 000000000000..14eff232bb6d --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/Makefile @@ -0,0 +1,17 @@ +CXX_SOURCES := main.cpp + +all: t2_0 t2_1 t1 a.out + +include Makefile.rules + +t1: t2_0 + $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_C_SOURCES=t1.c DYLIB_NAME=t1 LD_EXTRAS="-L. -lt2_0" + +t2_0: + $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_0.c DYLIB_NAME=t2_0 + +t2_1: + $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_1.c DYLIB_NAME=t2_1 diff --git a/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py b/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py new file mode 100644 index 000000000000..18135a18bdaf --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py @@ -0,0 +1,54 @@ +""" +Test that SBProcess.LoadImageUsingPaths uses RTLD_LAZY +""" + + + +import os +import shutil +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +@skipIfRemote +@skipIfWindows # The Windows platform doesn't implement DoLoadImage. +class LoadUsingLazyBind(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + NO_DEBUG_INFO_TESTCASE = True + + def setUp(self): + # Call super's setUp(). + TestBase.setUp(self) + + # Invoke the default build rule. + self.build() + + self.wd = os.path.realpath(self.getBuildDir()) + + self.ext = 'so' + if self.platformIsDarwin(): + self.ext = 'dylib' + + # Overwrite t2_0 with t2_1 to delete the definition of `use`. + shutil.copy(os.path.join(self.wd, 'libt2_1.{}'.format(self.ext)), + os.path.join(self.wd, 'libt2_0.{}'.format(self.ext))) + + @skipIfRemote + @skipIfWindows # The Windows platform doesn't implement DoLoadImage. 
+ def test_load_using_lazy_bind(self): + """Test that we load using RTLD_LAZY""" + + (target, process, thread, _) = lldbutil.run_to_source_breakpoint(self, + "break here", + lldb.SBFileSpec("main.cpp")) + error = lldb.SBError() + lib_spec = lldb.SBFileSpec("libt1.{}".format(self.ext)) + paths = lldb.SBStringList() + paths.AppendString(self.wd) + out_spec = lldb.SBFileSpec() + token = process.LoadImageUsingPaths(lib_spec, paths, out_spec, error) + self.assertNotEqual(token, lldb.LLDB_INVALID_IMAGE_TOKEN, "Got a valid token") diff --git a/lldb/test/API/functionalities/load_lazy/categories b/lldb/test/API/functionalities/load_lazy/categories new file mode 100644 index 000000000000..c00c25822e4c --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/categories @@ -0,0 +1 @@ +basic_process diff --git a/lldb/test/API/functionalities/load_lazy/main.cpp b/lldb/test/API/functionalities/load_lazy/main.cpp new file mode 100644 index 000000000000..ba45ee316cd4 --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/main.cpp @@ -0,0 +1,3 @@ +int main() { + return 0; // break here +} diff --git a/lldb/test/API/functionalities/load_lazy/t1.c b/lldb/test/API/functionalities/load_lazy/t1.c new file mode 100644 index 000000000000..08eae300490f --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/t1.c @@ -0,0 +1,3 @@ +extern void use(); +void f1() {} +void f2() { use(); } diff --git a/lldb/test/API/functionalities/load_lazy/t2_0.c b/lldb/test/API/functionalities/load_lazy/t2_0.c new file mode 100644 index 000000000000..9fc1edfbf460 --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/t2_0.c @@ -0,0 +1 @@ +void use() {} diff --git a/lldb/test/API/functionalities/load_lazy/t2_1.c b/lldb/test/API/functionalities/load_lazy/t2_1.c new file mode 100644 index 000000000000..e69de29bb2d1 -- GitLab From d8d5ef2e9d84fbbc2878b3fd977f9c62ea0661d7 Mon Sep 17 00:00:00 2001 From: Vedant Kumar Date: Fri, 19 Mar 2021 15:26:16 -0700 Subject: [PATCH 0209/1000] Revert "[lldb/PlatformPOSIX] Change LoadImage default to RTLD_LAZY" This reverts commit cb8c1ee269da72eb6e2c18800cd8ab0a74050785. The test is failing on Debian for unknown reasons. 
https://lab.llvm.org/buildbot/#/builders/68/builds/8990 --- .../Plugins/Platform/POSIX/PlatformPOSIX.cpp | 16 +----- .../API/functionalities/load_lazy/Makefile | 17 ------ .../load_lazy/TestLoadUsingLazyBind.py | 54 ------------------- .../API/functionalities/load_lazy/categories | 1 - .../API/functionalities/load_lazy/main.cpp | 3 -- lldb/test/API/functionalities/load_lazy/t1.c | 3 -- .../test/API/functionalities/load_lazy/t2_0.c | 1 - .../test/API/functionalities/load_lazy/t2_1.c | 0 8 files changed, 2 insertions(+), 93 deletions(-) delete mode 100644 lldb/test/API/functionalities/load_lazy/Makefile delete mode 100644 lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py delete mode 100644 lldb/test/API/functionalities/load_lazy/categories delete mode 100644 lldb/test/API/functionalities/load_lazy/main.cpp delete mode 100644 lldb/test/API/functionalities/load_lazy/t1.c delete mode 100644 lldb/test/API/functionalities/load_lazy/t2_0.c delete mode 100644 lldb/test/API/functionalities/load_lazy/t2_1.c diff --git a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp index 3e5f1451ef5f..c8a006001fcb 100644 --- a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp +++ b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp @@ -578,19 +578,7 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, // __lldb_dlopen_result for consistency. The wrapper returns a void * but // doesn't use it because UtilityFunctions don't work with void returns at // present. - // - // Use lazy binding so as to not make dlopen()'s success conditional on - // forcing every symbol in the library. - // - // In general, the debugger should allow programs to load & run with - // libraries as far as they can, instead of defaulting to being super-picky - // about unavailable symbols. - // - // The value "1" appears to imply lazy binding (RTLD_LAZY) on both Darwin - // and other POSIX OSes. static const char *dlopen_wrapper_code = R"( - const int RTLD_LAZY = 1; - struct __lldb_dlopen_result { void *image_ptr; const char *error_str; @@ -607,7 +595,7 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, { // This is the case where the name is the full path: if (!path_strings) { - result_ptr->image_ptr = dlopen(name, RTLD_LAZY); + result_ptr->image_ptr = dlopen(name, 2); if (result_ptr->image_ptr) result_ptr->error_str = nullptr; return nullptr; @@ -621,7 +609,7 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, buffer[path_len] = '/'; char *target_ptr = buffer+path_len+1; memcpy((void *) target_ptr, (void *) name, name_len + 1); - result_ptr->image_ptr = dlopen(buffer, RTLD_LAZY); + result_ptr->image_ptr = dlopen(buffer, 2); if (result_ptr->image_ptr) { result_ptr->error_str = nullptr; break; diff --git a/lldb/test/API/functionalities/load_lazy/Makefile b/lldb/test/API/functionalities/load_lazy/Makefile deleted file mode 100644 index 14eff232bb6d..000000000000 --- a/lldb/test/API/functionalities/load_lazy/Makefile +++ /dev/null @@ -1,17 +0,0 @@ -CXX_SOURCES := main.cpp - -all: t2_0 t2_1 t1 a.out - -include Makefile.rules - -t1: t2_0 - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_C_SOURCES=t1.c DYLIB_NAME=t1 LD_EXTRAS="-L. 
-lt2_0" - -t2_0: - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_0.c DYLIB_NAME=t2_0 - -t2_1: - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_1.c DYLIB_NAME=t2_1 diff --git a/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py b/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py deleted file mode 100644 index 18135a18bdaf..000000000000 --- a/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py +++ /dev/null @@ -1,54 +0,0 @@ -""" -Test that SBProcess.LoadImageUsingPaths uses RTLD_LAZY -""" - - - -import os -import shutil -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -@skipIfRemote -@skipIfWindows # The Windows platform doesn't implement DoLoadImage. -class LoadUsingLazyBind(TestBase): - - mydir = TestBase.compute_mydir(__file__) - - NO_DEBUG_INFO_TESTCASE = True - - def setUp(self): - # Call super's setUp(). - TestBase.setUp(self) - - # Invoke the default build rule. - self.build() - - self.wd = os.path.realpath(self.getBuildDir()) - - self.ext = 'so' - if self.platformIsDarwin(): - self.ext = 'dylib' - - # Overwrite t2_0 with t2_1 to delete the definition of `use`. - shutil.copy(os.path.join(self.wd, 'libt2_1.{}'.format(self.ext)), - os.path.join(self.wd, 'libt2_0.{}'.format(self.ext))) - - @skipIfRemote - @skipIfWindows # The Windows platform doesn't implement DoLoadImage. - def test_load_using_lazy_bind(self): - """Test that we load using RTLD_LAZY""" - - (target, process, thread, _) = lldbutil.run_to_source_breakpoint(self, - "break here", - lldb.SBFileSpec("main.cpp")) - error = lldb.SBError() - lib_spec = lldb.SBFileSpec("libt1.{}".format(self.ext)) - paths = lldb.SBStringList() - paths.AppendString(self.wd) - out_spec = lldb.SBFileSpec() - token = process.LoadImageUsingPaths(lib_spec, paths, out_spec, error) - self.assertNotEqual(token, lldb.LLDB_INVALID_IMAGE_TOKEN, "Got a valid token") diff --git a/lldb/test/API/functionalities/load_lazy/categories b/lldb/test/API/functionalities/load_lazy/categories deleted file mode 100644 index c00c25822e4c..000000000000 --- a/lldb/test/API/functionalities/load_lazy/categories +++ /dev/null @@ -1 +0,0 @@ -basic_process diff --git a/lldb/test/API/functionalities/load_lazy/main.cpp b/lldb/test/API/functionalities/load_lazy/main.cpp deleted file mode 100644 index ba45ee316cd4..000000000000 --- a/lldb/test/API/functionalities/load_lazy/main.cpp +++ /dev/null @@ -1,3 +0,0 @@ -int main() { - return 0; // break here -} diff --git a/lldb/test/API/functionalities/load_lazy/t1.c b/lldb/test/API/functionalities/load_lazy/t1.c deleted file mode 100644 index 08eae300490f..000000000000 --- a/lldb/test/API/functionalities/load_lazy/t1.c +++ /dev/null @@ -1,3 +0,0 @@ -extern void use(); -void f1() {} -void f2() { use(); } diff --git a/lldb/test/API/functionalities/load_lazy/t2_0.c b/lldb/test/API/functionalities/load_lazy/t2_0.c deleted file mode 100644 index 9fc1edfbf460..000000000000 --- a/lldb/test/API/functionalities/load_lazy/t2_0.c +++ /dev/null @@ -1 +0,0 @@ -void use() {} diff --git a/lldb/test/API/functionalities/load_lazy/t2_1.c b/lldb/test/API/functionalities/load_lazy/t2_1.c deleted file mode 100644 index e69de29bb2d1..000000000000 -- GitLab From 4c2da8641087f7b734337a6e6306329cd2535d60 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 19 Mar 2021 15:42:18 -0700 Subject: [PATCH 0210/1000] [Driver] Suppress GCC detection under -B In GCC, if `-B 
$prefix` is specified, `$prefix` is used to find executable files and startup files. `$prefix/include` is added as an include search directory. Clang overloads -B with GCC installation detection semantics which make the behavior less predictable (due to the "largest GCC version wins" rule) and interact poorly with --gcc-toolchain (--gcc-toolchain can be overridden by -B). * `clang++ foo.cpp` detects GCC installation under `/usr`. * `clang++ --gcc-toolchain=Inputs foo.cpp` detects GCC installation under `Inputs`. * `clang++ -BA --gcc-toolchain=B foo.cpp` detects GCC installation under A and B and the larger version wins. With this patch, only B is used for detection. * `clang++ -BA foo.cpp` detects GCC installation under `A` and `/usr`, and the larger GCC version wins. With this patch `A` is not used for detection. This patch changes -B to drop the GCC detection semantics. Its executable searching semantics are preserved. --gcc-toolchain is the recommended option to specify the GCC installation detection directory. (Note: Clang detects GCC installation in various target-dependent directories. `$sysroot/usr` (sysroot defaults to "") is a common directory used by most targets. Such a directory is expected to contain something like `lib{,32,64}/gcc{,-cross}/$triple`. Clang will then construct library/include paths from the directory.) Differential Revision: https://reviews.llvm.org/D97993 --- clang/docs/ReleaseNotes.rst | 7 +++++ clang/lib/Driver/ToolChains/Gnu.cpp | 4 +-- clang/test/Driver/android-ndk-standalone.cpp | 32 ++++++++++---------- clang/test/Driver/android-standalone.cpp | 12 ++++---- clang/test/Driver/gcc-toolchain.cpp | 11 +++++++ clang/test/Driver/print-multi-directory.c | 2 +- 6 files changed, 42 insertions(+), 26 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index c78445b9be6f..d4c9f53b82c0 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -72,6 +72,13 @@ Modified Compiler Flags ----------------------- - -Wshadow now also checks for shadowed structured bindings +- ``-B <prefix>`` (when ``<prefix>`` is a directory) was overloaded to additionally + detect GCC installations under ``<prefix>`` (``lib{,32,64}/gcc{,-cross}/$triple``). + This behavior was incompatible with GCC, caused interop issues with + ``--gcc-toolchain``, and was thus dropped. Specify ``--gcc-toolchain=<dir>`` + instead. ``-B``'s other GCC-compatible semantics are preserved: + ``$prefix/$triple-$file`` and ``$prefix$file`` are searched for executables, + libraries, includes, and data files used by the compiler. Removed Compiler Flags ------------------------- diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index fbf2f29e0514..38971288e38f 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -1909,9 +1909,7 @@ void Generic_GCC::GCCInstallationDetector::init( CandidateBiarchTripleAliases); // Compute the set of prefixes for our search.
- SmallVector Prefixes(D.PrefixDirs.begin(), - D.PrefixDirs.end()); - + SmallVector Prefixes; StringRef GCCToolchainDir = getGCCToolchainDir(Args, D.SysRoot); if (GCCToolchainDir != "") { if (GCCToolchainDir.back() == '/') diff --git a/clang/test/Driver/android-ndk-standalone.cpp b/clang/test/Driver/android-ndk-standalone.cpp index c4d939934782..8581963ae00d 100644 --- a/clang/test/Driver/android-ndk-standalone.cpp +++ b/clang/test/Driver/android-ndk-standalone.cpp @@ -3,7 +3,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi21 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck %s // CHECK: {{.*}}clang{{.*}}" "-cc1" @@ -34,7 +34,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi14 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-14 %s // CHECK-14: "-L{{.*}}/sysroot/usr/lib/arm-linux-androideabi/14" @@ -42,7 +42,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi21 -stdlib=libstdc++ \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-STDCXX %s // CHECK-STDCXX: {{.*}}clang{{.*}}" "-cc1" @@ -76,7 +76,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target armv7a-none-linux-androideabi21 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ARMV7 %s // CHECK-ARMV7: {{.*}}clang{{.*}}" "-cc1" @@ -109,19 +109,19 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi21 \ // RUN: -march=armv7 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ARMV7 %s // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi21 \ // RUN: -march=armv7a \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ARMV7 %s // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi21 \ // RUN: -march=armv7-a \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ARMV7 %s // @@ -129,7 +129,7 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi21 \ // RUN: -mthumb \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-THUMB %s // CHECK-THUMB: {{.*}}clang{{.*}}" "-cc1" @@ -163,7 +163,7 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ 
// RUN: -target arm-linux-androideabi21 \ // RUN: -march=armv7-a -mthumb \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ARMV7THUMB %s // CHECK-ARMV7THUMB: {{.*}}clang{{.*}}" "-cc1" @@ -195,7 +195,7 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi21 \ // RUN: -march=armv7-a -mthumb \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: -print-multi-lib \ // RUN: | FileCheck --check-prefix=CHECK-ARM-MULTILIBS %s @@ -209,13 +209,13 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target armv7a-none-linux-androideabi21 \ // RUN: -mthumb \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ARMV7THUMB %s // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target aarch64-linux-android21 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-AARCH64 %s // CHECK-AARCH64: {{.*}}clang{{.*}}" "-cc1" @@ -231,7 +231,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm64-linux-android21 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ARM64 %s // CHECK-ARM64: {{.*}}clang{{.*}}" "-cc1" @@ -248,7 +248,7 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target mipsel-linux-android21 \ // RUN: -mips32 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-MIPS %s // CHECK-MIPS: {{.*}}clang{{.*}}" "-cc1" @@ -263,7 +263,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target i686-linux-android21 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-I686 %s // CHECK-I686: {{.*}}clang{{.*}}" "-cc1" @@ -279,7 +279,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target x86_64-linux-android21 \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-X86_64 %s // CHECK-X86_64: {{.*}}clang{{.*}}" "-cc1" diff --git a/clang/test/Driver/android-standalone.cpp b/clang/test/Driver/android-standalone.cpp index 0f8cf0b1355e..c238fc734716 100644 --- a/clang/test/Driver/android-standalone.cpp +++ b/clang/test/Driver/android-standalone.cpp @@ -3,7 +3,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm-linux-androideabi -stdlib=libstdc++ \ -// RUN: -B%S/Inputs/basic_android_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ // RUN: 
--sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck %s // CHECK: {{.*}}clang{{.*}}" "-cc1" @@ -18,7 +18,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target aarch64-linux-android -stdlib=libstdc++ \ -// RUN: -B%S/Inputs/basic_android_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-AARCH64 %s // CHECK-AARCH64: {{.*}}clang{{.*}}" "-cc1" @@ -33,7 +33,7 @@ // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target arm64-linux-android -stdlib=libstdc++ \ -// RUN: -B%S/Inputs/basic_android_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-ARM64 %s // CHECK-ARM64: {{.*}}clang{{.*}}" "-cc1" @@ -49,7 +49,7 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target mipsel-linux-android \ // RUN: -mips32 -stdlib=libstdc++ \ -// RUN: -B%S/Inputs/basic_android_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-MIPS %s // CHECK-MIPS: {{.*}}clang{{.*}}" "-cc1" @@ -65,7 +65,7 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target mipsel-linux-android \ // RUN: -march=mips32 -mips32r2 -stdlib=libstdc++ \ -// RUN: -B%S/Inputs/basic_android_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-MIPSR2 %s // CHECK-MIPSR2: {{.*}}clang{{.*}}" "-cc1" @@ -81,7 +81,7 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: -target mipsel-linux-android \ // RUN: -mips32 -march=mips32r2 -stdlib=libstdc++ \ -// RUN: -B%S/Inputs/basic_android_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_tree \ // RUN: --sysroot=%S/Inputs/basic_android_tree/sysroot \ // RUN: | FileCheck --check-prefix=CHECK-MIPSR2-A %s // CHECK-MIPSR2-A: {{.*}}clang{{.*}}" "-cc1" diff --git a/clang/test/Driver/gcc-toolchain.cpp b/clang/test/Driver/gcc-toolchain.cpp index 6c872f4255c3..cddf9b1bdbca 100644 --- a/clang/test/Driver/gcc-toolchain.cpp +++ b/clang/test/Driver/gcc-toolchain.cpp @@ -29,3 +29,14 @@ // CHECK: "{{[^"]*}}/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK: "-L[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5" // CHECK: "-L[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../.." + +/// Test we don't detect GCC installation under -B. 
+// RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \ +// RUN: --target=aarch64-suse-linux --gcc-toolchain=%S/Inputs/opensuse_42.2_aarch64_tree/usr | \ +// RUN: FileCheck %s --check-prefix=AARCH64 +// RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \ +// RUN: --target=aarch64-suse-linux -B%S/Inputs/opensuse_42.2_aarch64_tree/usr | \ +// RUN: FileCheck %s --check-prefix=NO_AARCH64 + +// AARCH64: Inputs{{[^"]+}}aarch64-suse-linux/{{[^"]+}}crt1.o" +// NO_AARCH64-NOT: Inputs{{[^"]+}}aarch64-suse-linux/{{[^"]+}}crt1.o" diff --git a/clang/test/Driver/print-multi-directory.c b/clang/test/Driver/print-multi-directory.c index 5fb6a118e115..2504c28ba994 100644 --- a/clang/test/Driver/print-multi-directory.c +++ b/clang/test/Driver/print-multi-directory.c @@ -19,7 +19,7 @@ // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>/dev/null \ // RUN: -target arm-linux-androideabi21 \ // RUN: -mthumb \ -// RUN: -B%S/Inputs/basic_android_ndk_tree \ +// RUN: --gcc-toolchain=%S/Inputs/basic_android_ndk_tree \ // RUN: --sysroot=%S/Inputs/basic_android_ndk_tree/sysroot \ // RUN: -print-multi-directory \ // RUN: | FileCheck --match-full-lines --check-prefix=CHECK-ARM-MULTILIBS %s -- GitLab From 94a793f096653fa3536f39c6c1b9e3281907619f Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 19 Mar 2021 15:42:37 -0700 Subject: [PATCH 0211/1000] [docs] Improve documentation of -B and --gcc-toolchain Differential Revision: https://reviews.llvm.org/D97902 --- clang/docs/ClangCommandLineReference.rst | 6 +++--- clang/include/clang/Driver/Options.td | 12 ++++++++---- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst index bca5722f80d0..962d717483e0 100644 --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -18,9 +18,9 @@ GCC-compatible ``clang`` and ``clang++`` drivers. .. program:: clang -.. option:: -B<dir>, --prefix <arg>, --prefix=<arg> +.. option:: -B<prefix>, --prefix <arg>, --prefix=<arg> -Add <dir> to search path for binaries and object files used implicitly +Search $prefix/$triple-$file and $prefix$file for executables, libraries, includes, and data files used by the compiler. $prefix may or may not be a directory .. option:: -F<arg> @@ -256,7 +256,7 @@ Build this module as a system module. Only used with -emit-module .. option:: --gcc-toolchain=<arg>, -gcc-toolchain <arg> -Use the gcc toolchain at the given directory +Search for GCC installation in the specified directory on targets which commonly use GCC. The directory usually contains 'lib{,32,64}/gcc{,-cross}/$triple' and 'include'. If specified, sysroot is skipped for GCC detection. Note: executables (e.g. ld) used by the compiler are not overridden by the selected GCC installation ..
option:: -gcodeview diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index b7efb7469a23..85a0e02e6357 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -601,8 +601,14 @@ def _HASH_HASH_HASH : Flag<["-"], "###">, Flags<[NoXarchOption, CoreOption, Flan def _DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>, Flags<[NoXarchOption, CoreOption]>; def A : JoinedOrSeparate<["-"], "A">, Flags<[RenderJoined]>, Group; -def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"<dir>">, - HelpText<"Add <dir> to search path for binaries and object files used implicitly">; +def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"<prefix>">, + HelpText<"Search $prefix/$triple-$file and $prefix$file for executables, libraries, " + "includes, and data files used by the compiler. $prefix may or may not be a directory">; +def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>, + HelpText<"Search for GCC installation in the specified directory on targets which commonly use GCC. " + "The directory usually contains 'lib{,32,64}/gcc{,-cross}/$triple' and 'include'. If specified, " + "sysroot is skipped for GCC detection. Note: executables (e.g. ld) used by the compiler are not " + "overridden by the selected GCC installation">; def CC : Flag<["-"], "CC">, Flags<[CC1Option]>, Group, HelpText<"Include comments from within macros in preprocessed output">, MarshallingInfoFlag>; @@ -3673,8 +3679,6 @@ def print_supported_cpus : Flag<["-", "--"], "print-supported-cpus">, MarshallingInfoFlag>; def mcpu_EQ_QUESTION : Flag<["-"], "mcpu=?">, Alias; def mtune_EQ_QUESTION : Flag<["-"], "mtune=?">, Alias; -def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[NoXarchOption]>, - HelpText<"Use the gcc toolchain at the given directory">; def time : Flag<["-"], "time">, HelpText<"Time individual commands">; def traditional_cpp : Flag<["-", "--"], "traditional-cpp">, Flags<[CC1Option]>, -- GitLab From 4bd2bfb6ec0980853d7f9d1874e0547b68b7a61e Mon Sep 17 00:00:00 2001 From: Vedant Kumar Date: Thu, 18 Mar 2021 11:12:17 -0700 Subject: [PATCH 0212/1000] [lldb/PlatformPOSIX] Change LoadImage default to RTLD_LAZY (reapply) In general, it seems like the debugger should allow programs to load & run with libraries as far as possible, instead of defaulting to being super-picky about unavailable symbols.
See: https://lists.llvm.org/pipermail/lldb-dev/2021-March/016796.html Differential Revision: https://reviews.llvm.org/D98879 --- .../Plugins/Platform/POSIX/PlatformPOSIX.cpp | 16 ++++- lldb/test/API/.lit_test_times.txt | 1 + .../API/functionalities/load_lazy/Makefile | 18 +++++ .../load_lazy/TestLoadUsingLazyBind.py | 65 +++++++++++++++++++ .../API/functionalities/load_lazy/categories | 1 + .../API/functionalities/load_lazy/main.cpp | 3 + lldb/test/API/functionalities/load_lazy/t1.c | 3 + .../test/API/functionalities/load_lazy/t2_0.c | 1 + .../test/API/functionalities/load_lazy/t2_1.c | 0 9 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 lldb/test/API/.lit_test_times.txt create mode 100644 lldb/test/API/functionalities/load_lazy/Makefile create mode 100644 lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py create mode 100644 lldb/test/API/functionalities/load_lazy/categories create mode 100644 lldb/test/API/functionalities/load_lazy/main.cpp create mode 100644 lldb/test/API/functionalities/load_lazy/t1.c create mode 100644 lldb/test/API/functionalities/load_lazy/t2_0.c create mode 100644 lldb/test/API/functionalities/load_lazy/t2_1.c diff --git a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp index c8a006001fcb..3e5f1451ef5f 100644 --- a/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp +++ b/lldb/source/Plugins/Platform/POSIX/PlatformPOSIX.cpp @@ -578,7 +578,19 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, // __lldb_dlopen_result for consistency. The wrapper returns a void * but // doesn't use it because UtilityFunctions don't work with void returns at // present. + // + // Use lazy binding so as to not make dlopen()'s success conditional on + // forcing every symbol in the library. + // + // In general, the debugger should allow programs to load & run with + // libraries as far as they can, instead of defaulting to being super-picky + // about unavailable symbols. + // + // The value "1" appears to imply lazy binding (RTLD_LAZY) on both Darwin + // and other POSIX OSes. 
static const char *dlopen_wrapper_code = R"( + const int RTLD_LAZY = 1; + struct __lldb_dlopen_result { void *image_ptr; const char *error_str; @@ -595,7 +607,7 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, { // This is the case where the name is the full path: if (!path_strings) { - result_ptr->image_ptr = dlopen(name, 2); + result_ptr->image_ptr = dlopen(name, RTLD_LAZY); if (result_ptr->image_ptr) result_ptr->error_str = nullptr; return nullptr; @@ -609,7 +621,7 @@ PlatformPOSIX::MakeLoadImageUtilityFunction(ExecutionContext &exe_ctx, buffer[path_len] = '/'; char *target_ptr = buffer+path_len+1; memcpy((void *) target_ptr, (void *) name, name_len + 1); - result_ptr->image_ptr = dlopen(buffer, 2); + result_ptr->image_ptr = dlopen(buffer, RTLD_LAZY); if (result_ptr->image_ptr) { result_ptr->error_str = nullptr; break; diff --git a/lldb/test/API/.lit_test_times.txt b/lldb/test/API/.lit_test_times.txt new file mode 100644 index 000000000000..5b848a0183c9 --- /dev/null +++ b/lldb/test/API/.lit_test_times.txt @@ -0,0 +1 @@ +2.777875e+00 functionalities/load_lazy/TestLoadUsingLazyBind.py diff --git a/lldb/test/API/functionalities/load_lazy/Makefile b/lldb/test/API/functionalities/load_lazy/Makefile new file mode 100644 index 000000000000..7200114d03ae --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/Makefile @@ -0,0 +1,18 @@ +CXX_SOURCES := main.cpp +USE_LIBDL := 1 + +all: t2_0 t2_1 t1 a.out + +include Makefile.rules + +t1: t2_0 + $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_C_SOURCES=t1.c DYLIB_NAME=t1 LD_EXTRAS="-L. -lt2_0" + +t2_0: + $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_0.c DYLIB_NAME=t2_0 + +t2_1: + $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_1.c DYLIB_NAME=t2_1 diff --git a/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py b/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py new file mode 100644 index 000000000000..a32e589884ce --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/TestLoadUsingLazyBind.py @@ -0,0 +1,65 @@ +""" +Test that SBProcess.LoadImageUsingPaths uses RTLD_LAZY +""" + + + +import os +import shutil +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class LoadUsingLazyBind(TestBase): + + mydir = TestBase.compute_mydir(__file__) + NO_DEBUG_INFO_TESTCASE = True + + @skipIfRemote + @skipIfWindows # The Windows platform doesn't implement DoLoadImage. + # Failing for unknown reasons on Linux, see + # https://bugs.llvm.org/show_bug.cgi?id=49656. + @skipUnlessDarwin + def test_load_using_lazy_bind(self): + """Test that we load using RTLD_LAZY""" + + self.build() + wd = os.path.realpath(self.getBuildDir()) + + ext = '.so' + if self.platformIsDarwin(): + ext = '.dylib' + + def make_lib_path(name): + libpath = os.path.join(wd, name + ext) + self.assertTrue(os.path.exists(libpath)) + return libpath + + libt1 = make_lib_path('libt1') + libt2_0 = make_lib_path('libt2_0') + libt2_1 = make_lib_path('libt2_1') + + # Overwrite t2_0 with t2_1 to delete the definition of `use`. 
+ shutil.copy(libt2_1, libt2_0) + + # Launch a process and break + (target, process, thread, _) = lldbutil.run_to_source_breakpoint(self, + "break here", + lldb.SBFileSpec("main.cpp")) + + # Load libt1; should fail unless we use RTLD_LAZY + error = lldb.SBError() + lib_spec = lldb.SBFileSpec('libt1' + ext) + paths = lldb.SBStringList() + paths.AppendString(wd) + out_spec = lldb.SBFileSpec() + token = process.LoadImageUsingPaths(lib_spec, paths, out_spec, error) + self.assertNotEqual(token, lldb.LLDB_INVALID_IMAGE_TOKEN, "Got a valid token") + + # Calling `f1()` should return 5. + frame = thread.GetFrameAtIndex(0) + val = frame.EvaluateExpression("f1()") + self.assertTrue(val.IsValid()) + self.assertEquals(val.GetValueAsSigned(-1), 5) diff --git a/lldb/test/API/functionalities/load_lazy/categories b/lldb/test/API/functionalities/load_lazy/categories new file mode 100644 index 000000000000..c00c25822e4c --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/categories @@ -0,0 +1 @@ +basic_process diff --git a/lldb/test/API/functionalities/load_lazy/main.cpp b/lldb/test/API/functionalities/load_lazy/main.cpp new file mode 100644 index 000000000000..ba45ee316cd4 --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/main.cpp @@ -0,0 +1,3 @@ +int main() { + return 0; // break here +} diff --git a/lldb/test/API/functionalities/load_lazy/t1.c b/lldb/test/API/functionalities/load_lazy/t1.c new file mode 100644 index 000000000000..e2fc21327062 --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/t1.c @@ -0,0 +1,3 @@ +extern void use(); +int f1() { return 5; } +void f2() { use(); } diff --git a/lldb/test/API/functionalities/load_lazy/t2_0.c b/lldb/test/API/functionalities/load_lazy/t2_0.c new file mode 100644 index 000000000000..9fc1edfbf460 --- /dev/null +++ b/lldb/test/API/functionalities/load_lazy/t2_0.c @@ -0,0 +1 @@ +void use() {} diff --git a/lldb/test/API/functionalities/load_lazy/t2_1.c b/lldb/test/API/functionalities/load_lazy/t2_1.c new file mode 100644 index 000000000000..e69de29bb2d1 -- GitLab From 528f6f7d617757addac9b51dd5bcc1ab1352e9be Mon Sep 17 00:00:00 2001 From: Christoffer Lernö Date: Fri, 19 Mar 2021 18:55:52 -0400 Subject: [PATCH 0213/1000] Add type attributes to LLVM C API The LLVM C API is missing type attributes, which are needed by attributes such as sret and byval. This patch adds three missing wrapper functions. Bugzilla: https://bugs.llvm.org/show_bug.cgi?id=48249 https://reviews.llvm.org/D97763 --- llvm/include/llvm-c/Core.h | 12 ++++++++++++ llvm/lib/IR/Core.cpp | 16 ++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 91c15323451f..b3200520b90a 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -604,6 +604,17 @@ unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A); */ uint64_t LLVMGetEnumAttributeValue(LLVMAttributeRef A); +/** + * Create a type attribute + */ +LLVMAttributeRef LLVMCreateTypeAttribute(LLVMContextRef C, unsigned KindID, + LLVMTypeRef type_ref); + +/** + * Get the type attribute's value. + */ +LLVMTypeRef LLVMGetTypeAttributeValue(LLVMAttributeRef A); + /** * Create a string attribute. */ @@ -626,6 +637,7 @@ const char *LLVMGetStringAttributeValue(LLVMAttributeRef A, unsigned *Length); */ LLVMBool LLVMIsEnumAttribute(LLVMAttributeRef A); LLVMBool LLVMIsStringAttribute(LLVMAttributeRef A); +LLVMBool LLVMIsTypeAttribute(LLVMAttributeRef A); /** * Obtain a Type from a context by its registered name.
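For illustration, the three new wrappers could be exercised together like this (a hedged sketch, not part of the patch; beyond the functions added above, it only assumes the pre-existing C API call LLVMGetEnumAttributeKindForName):

```cpp
#include <llvm-c/Core.h>
#include <cassert>

int main() {
  LLVMContextRef Ctx = LLVMContextCreate();
  LLVMTypeRef I64 = LLVMInt64TypeInContext(Ctx);

  // Look up the enum kind for "sret", then wrap a type in a type attribute.
  unsigned Kind = LLVMGetEnumAttributeKindForName("sret", 4);
  LLVMAttributeRef SRet = LLVMCreateTypeAttribute(Ctx, Kind, I64);

  // The predicate and accessor round-trip the type we passed in.
  assert(LLVMIsTypeAttribute(SRet));
  assert(LLVMGetTypeAttributeValue(SRet) == I64);

  LLVMContextDispose(Ctx);
  return 0;
}
```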
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 7398a7efd8cd..2d93d50b8899 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -164,6 +164,18 @@ uint64_t LLVMGetEnumAttributeValue(LLVMAttributeRef A) { return Attr.getValueAsInt(); } +LLVMAttributeRef LLVMCreateTypeAttribute(LLVMContextRef C, unsigned KindID, + LLVMTypeRef type_ref) { + auto &Ctx = *unwrap(C); + auto AttrKind = (Attribute::AttrKind)KindID; + return wrap(Attribute::get(Ctx, AttrKind, unwrap(type_ref))); +} + +LLVMTypeRef LLVMGetTypeAttributeValue(LLVMAttributeRef A) { + auto Attr = unwrap(A); + return wrap(Attr.getValueAsType()); +} + LLVMAttributeRef LLVMCreateStringAttribute(LLVMContextRef C, const char *K, unsigned KLength, const char *V, unsigned VLength) { @@ -194,6 +206,10 @@ LLVMBool LLVMIsStringAttribute(LLVMAttributeRef A) { return unwrap(A).isStringAttribute(); } +LLVMBool LLVMIsTypeAttribute(LLVMAttributeRef A) { + return unwrap(A).isTypeAttribute(); +} + char *LLVMGetDiagInfoDescription(LLVMDiagnosticInfoRef DI) { std::string MsgStorage; raw_string_ostream Stream(MsgStorage); -- GitLab From 602e19ed79b8a15500bf7a683cbaa1ca24c9536d Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 19 Mar 2021 14:54:07 -0700 Subject: [PATCH 0214/1000] [JITLink] Don't issue lookups for empty symbol sets. Issuing a lookup for an empty symbol set is legal, but can actually result in unrelated work being done if there was a work queue left over from the previous lookup. We can avoid doing this unrelated work (reducing stack depth and interleaving of debugging output) by not issuing these no-op lookups in the first place. --- .../ExecutionEngine/JITLink/JITLinkGeneric.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp index 2e5b7cbc3745..63f862b96325 100644 --- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp @@ -66,14 +66,27 @@ void JITLinkerBase::linkPhase1(std::unique_ptr Self) { return Ctx->notifyFailed(std::move(Err)); // Notify client that the defined symbols have been assigned addresses. - LLVM_DEBUG( - { dbgs() << "Resolving symbols defined in " << G->getName() << "\n"; }); + LLVM_DEBUG(dbgs() << "Resolving symbols defined in " << G->getName() << "\n"); if (auto Err = Ctx->notifyResolved(*G)) return Ctx->notifyFailed(std::move(Err)); auto ExternalSymbols = getExternalSymbolNames(); + // If there are no external symbols then proceed immediately with phase 2. + if (ExternalSymbols.empty()) { + LLVM_DEBUG({ + dbgs() << "No external symbols for " << G->getName() + << ". Proceeding immediately with link phase 2.\n"; + }); + // FIXME: Once callee expressions are defined to be sequenced before + // argument expressions (c++17) we can simplify this. See below. + auto &TmpSelf = *Self; + TmpSelf.linkPhase2(std::move(Self), AsyncLookupResult(), std::move(Layout)); + return; + } + + // Otherwise look up the externals. 
LLVM_DEBUG({ dbgs() << "Issuing lookup for external symbols for " << G->getName() << " (may trigger materialization/linking of other graphs)...\n"; -- GitLab From d90270e9e800d22d4d4dca1bfad05d6a491b42f0 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Fri, 19 Mar 2021 16:24:16 -0700 Subject: [PATCH 0215/1000] Port D97640 to llvm/include/llvm/ProfileData/InstrProfData.inc Differential Revision: https://reviews.llvm.org/D98982 --- llvm/include/llvm/ProfileData/InstrProfData.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index 6126a61efb72..ffc7dee4ed6d 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -873,7 +873,7 @@ InstrProfGetRangeRepValue(uint64_t Value) { return Value; else // Otherwise, take to the previous power of two + 1. - return (1 << (64 - InstProfClzll(Value) - 1)) + 1; + return (UINT64_C(1) << (64 - InstProfClzll(Value) - 1)) + 1; } /* Return true if the range that an (observed) memop size value belongs to has -- GitLab From eef8b74ef5efd5265ad35c8d6ebdbfbe43e81bfd Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 19 Mar 2021 16:23:30 -0700 Subject: [PATCH 0216/1000] gn build: Unbreak Android cross-compilation. - D96404 defaulted to libunwind which isn't provided by NDK r21 (or r22), so specify -rtlib=libgcc on non-arm32. - D97993 means that we need to use --gcc-toolchain instead of -B to let the driver find libgcc. --- llvm/utils/gn/build/toolchain/target_flags.gni | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/build/toolchain/target_flags.gni b/llvm/utils/gn/build/toolchain/target_flags.gni index 0af52a0db6da..573e758a6d6f 100644 --- a/llvm/utils/gn/build/toolchain/target_flags.gni +++ b/llvm/utils/gn/build/toolchain/target_flags.gni @@ -13,8 +13,11 @@ if (current_os == "android") { target_flags += [ "--target=$llvm_current_triple", "--sysroot=$android_ndk_path/toolchains/llvm/prebuilt/linux-x86_64/sysroot", - "-B$android_ndk_path/toolchains/llvm/prebuilt/linux-x86_64", + "--gcc-toolchain=$android_ndk_path/toolchains/llvm/prebuilt/linux-x86_64", ] + if (current_cpu != "arm") { + target_flags += [ "-rtlib=libgcc" ] + } target_ldflags += [ "-static-libstdc++" ] if (current_cpu == "arm") { target_flags += [ "-march=armv7-a" ] -- GitLab From d75a611afbc7c5f8c343e0398dd2b506684e506b Mon Sep 17 00:00:00 2001 From: River Riddle Date: Fri, 19 Mar 2021 16:19:23 -0700 Subject: [PATCH 0217/1000] [mlir] Update `simplifyRegions` to use RewriterBase for erasure notifications This allows for notifying callers when operations/blocks get erased, which is especially useful for the greedy pattern driver. The current greedy pattern driver "throws away" all information on constants in the operation folder because it doesn't know if they get erased or not. By passing in RewriterBase, we can directly track this and prevent the need for the pattern driver to rediscover all of the existing constants. In some situations this cuts the compile time of the canonicalizer in half. 
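The idea can be pictured with a small standalone sketch (plain C++ with invented names, not the actual MLIR classes): erasure goes through a rewriter that notifies a registered listener, so the caller can invalidate cached state for exactly the erased object instead of discarding everything.

```cpp
#include <cstdio>
#include <functional>

struct Op { int id; };

struct Rewriter {
  std::function<void(const Op &)> onErase; // e.g. the greedy driver's folder
  void eraseOp(Op &op) {
    if (onErase)
      onErase(op); // notify before the object goes away
    // ... actual deletion would happen here ...
  }
};

int main() {
  Rewriter rewriter;
  rewriter.onErase = [](const Op &op) {
    std::printf("dropping cached constant info for op %d\n", op.id);
  };
  Op dead{42};
  // A simplification utility erases one op; cached entries for every other
  // op survive, so nothing has to be rediscovered on the next iteration.
  rewriter.eraseOp(dead);
  return 0;
}
```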
Differential Revision: https://reviews.llvm.org/D98755 --- mlir/include/mlir/Transforms/RegionUtils.h | 7 ++- .../Utils/GreedyPatternRewriteDriver.cpp | 7 +-- mlir/lib/Transforms/Utils/RegionUtils.cpp | 46 +++++++++++-------- mlir/test/Dialect/SCF/canonicalize.mlir | 2 +- 4 files changed, 35 insertions(+), 27 deletions(-) diff --git a/mlir/include/mlir/Transforms/RegionUtils.h b/mlir/include/mlir/Transforms/RegionUtils.h index 72c2f51c9e70..c2124d8b70f0 100644 --- a/mlir/include/mlir/Transforms/RegionUtils.h +++ b/mlir/include/mlir/Transforms/RegionUtils.h @@ -15,6 +15,7 @@ #include "llvm/ADT/SetVector.h" namespace mlir { +class RewriterBase; /// Check if all values in the provided range are defined above the `limit` /// region. That is, if they are defined in a region that is a proper ancestor @@ -53,8 +54,10 @@ void getUsedValuesDefinedAbove(MutableArrayRef regions, /// Run a set of structural simplifications over the given regions. This /// includes transformations like unreachable block elimination, dead argument /// elimination, as well as some other DCE. This function returns success if any -/// of the regions were simplified, failure otherwise. -LogicalResult simplifyRegions(MutableArrayRef regions); +/// of the regions were simplified, failure otherwise. The provided rewriter is +/// used to notify callers of operation and block deletion. +LogicalResult simplifyRegions(RewriterBase &rewriter, + MutableArrayRef regions); } // namespace mlir diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp index 9ed3b3514db6..922fbb1bee06 100644 --- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp @@ -114,7 +114,7 @@ private: // TODO: This is based on the fact that zero use operations // may be deleted, and that single use values often have more // canonicalization opportunities. - if (!operand.use_empty() && !operand.hasOneUse()) + if (!operand || (!operand.use_empty() && !operand.hasOneUse())) continue; if (auto *defInst = operand.getDefiningOp()) addToWorklist(defInst); @@ -202,10 +202,7 @@ bool GreedyPatternRewriteDriver::simplify(MutableArrayRef regions, // After applying patterns, make sure that the CFG of each of the regions is // kept up to date. - if (succeeded(simplifyRegions(regions))) { - folder.clear(); - changed = true; - } + changed |= succeeded(simplifyRegions(*this, regions)); } while (changed && ++i < maxIterations); // Whether the rewrite converges, i.e. wasn't changed in the last iteration. return !changed; diff --git a/mlir/lib/Transforms/Utils/RegionUtils.cpp b/mlir/lib/Transforms/Utils/RegionUtils.cpp index 21d0ff53fdc8..47635c3bbf49 100644 --- a/mlir/lib/Transforms/Utils/RegionUtils.cpp +++ b/mlir/lib/Transforms/Utils/RegionUtils.cpp @@ -9,6 +9,7 @@ #include "mlir/Transforms/RegionUtils.h" #include "mlir/IR/Block.h" #include "mlir/IR/Operation.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/IR/RegionGraphTraits.h" #include "mlir/IR/Value.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" @@ -75,7 +76,8 @@ void mlir::getUsedValuesDefinedAbove(MutableArrayRef regions, /// Erase the unreachable blocks within the provided regions. Returns success /// if any blocks were erased, failure otherwise. // TODO: We could likely merge this with the DCE algorithm below. 
-static LogicalResult eraseUnreachableBlocks(MutableArrayRef<Region> regions) {
+static LogicalResult eraseUnreachableBlocks(RewriterBase &rewriter,
+                                            MutableArrayRef<Region> regions) {
   // Set of blocks found to be reachable within a given region.
   llvm::df_iterator_default_set<Block *, 16> reachable;
   // If any blocks were found to be dead.
@@ -108,7 +110,7 @@ static LogicalResult eraseUnreachableBlocks(MutableArrayRef<Region> regions) {
     for (Block &block : llvm::make_early_inc_range(*region)) {
       if (!reachable.count(&block)) {
         block.dropAllDefinedValueUses();
-        block.erase();
+        rewriter.eraseBlock(&block);
         erasedDeadBlocks = true;
         continue;
       }
@@ -305,7 +307,8 @@ static void eraseTerminatorSuccessorOperands(Operation *terminator,
   }
 }
 
-static LogicalResult deleteDeadness(MutableArrayRef<Region> regions,
+static LogicalResult deleteDeadness(RewriterBase &rewriter,
+                                    MutableArrayRef<Region> regions,
                                     LiveMap &liveMap) {
   bool erasedAnything = false;
   for (Region &region : regions) {
@@ -324,10 +327,10 @@ static LogicalResult deleteDeadness(MutableArrayRef<Region> regions,
       if (!liveMap.wasProvenLive(&childOp)) {
         erasedAnything = true;
         childOp.dropAllUses();
-        childOp.erase();
+        rewriter.eraseOp(&childOp);
       } else {
-        erasedAnything |=
-            succeeded(deleteDeadness(childOp.getRegions(), liveMap));
+        erasedAnything |= succeeded(
+            deleteDeadness(rewriter, childOp.getRegions(), liveMap));
       }
     }
   }
@@ -359,7 +362,8 @@ static LogicalResult deleteDeadness(MutableArrayRef<Region> regions,
 //
 // This function returns success if any operations or arguments were deleted,
 // failure otherwise.
-static LogicalResult runRegionDCE(MutableArrayRef<Region> regions) {
+static LogicalResult runRegionDCE(RewriterBase &rewriter,
+                                  MutableArrayRef<Region> regions) {
   LiveMap liveMap;
   do {
     liveMap.resetChanged();
@@ -368,7 +372,7 @@ static LogicalResult runRegionDCE(MutableArrayRef<Region> regions) {
     propagateLiveness(region, liveMap);
   } while (liveMap.hasChanged());
 
-  return deleteDeadness(regions, liveMap);
+  return deleteDeadness(rewriter, regions, liveMap);
 }
 
 //===----------------------------------------------------------------------===//
@@ -456,7 +460,7 @@ public:
   LogicalResult addToCluster(BlockEquivalenceData &blockData);
 
   /// Try to merge all of the blocks within this cluster into the leader block.
-  LogicalResult merge();
+  LogicalResult merge(RewriterBase &rewriter);
 
 private:
   /// The equivalence data for the leader of the cluster.
@@ -550,7 +554,7 @@ static bool ableToUpdatePredOperands(Block *block) {
   return true;
 }
 
-LogicalResult BlockMergeCluster::merge() {
+LogicalResult BlockMergeCluster::merge(RewriterBase &rewriter) {
   // Don't consider clusters that don't have blocks to merge.
   if (blocksToMerge.empty())
     return failure();
@@ -613,7 +617,7 @@ LogicalResult BlockMergeCluster::merge() {
   // Replace all uses of the merged blocks with the leader and erase them.
   for (Block *block : blocksToMerge) {
     block->replaceAllUsesWith(leaderBlock);
-    block->erase();
+    rewriter.eraseBlock(block);
   }
   return success();
 }
@@ -621,7 +625,8 @@ LogicalResult BlockMergeCluster::merge() {
 
 /// Identify identical blocks within the given region and merge them, inserting
 /// new block arguments as necessary. Returns success if any blocks were merged,
 /// failure otherwise.
-static LogicalResult mergeIdenticalBlocks(Region &region) {
+static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
+                                          Region &region) {
   if (region.empty() || llvm::hasSingleElement(region))
     return failure();
 
@@ -659,7 +664,7 @@ static LogicalResult mergeIdenticalBlocks(Region &region) {
         clusters.emplace_back(std::move(data));
     }
     for (auto &cluster : clusters)
-      mergedAnyBlocks |= succeeded(cluster.merge());
+      mergedAnyBlocks |= succeeded(cluster.merge(rewriter));
   }
 
   return success(mergedAnyBlocks);
@@ -667,14 +672,15 @@ static LogicalResult mergeIdenticalBlocks(Region &region) {
 
 /// Identify identical blocks within the given regions and merge them, inserting
 /// new block arguments as necessary.
-static LogicalResult mergeIdenticalBlocks(MutableArrayRef<Region> regions) {
+static LogicalResult mergeIdenticalBlocks(RewriterBase &rewriter,
+                                          MutableArrayRef<Region> regions) {
   llvm::SmallSetVector<Region *, 8> worklist;
   for (auto &region : regions)
     worklist.insert(&region);
   bool anyChanged = false;
   while (!worklist.empty()) {
     Region *region = worklist.pop_back_val();
-    if (succeeded(mergeIdenticalBlocks(*region))) {
+    if (succeeded(mergeIdenticalBlocks(rewriter, *region))) {
       worklist.insert(region);
       anyChanged = true;
     }
@@ -697,10 +703,12 @@ static LogicalResult mergeIdenticalBlocks(MutableArrayRef<Region> regions) {
 /// includes transformations like unreachable block elimination, dead argument
 /// elimination, as well as some other DCE. This function returns success if any
 /// of the regions were simplified, failure otherwise.
-LogicalResult mlir::simplifyRegions(MutableArrayRef<Region> regions) {
-  bool eliminatedBlocks = succeeded(eraseUnreachableBlocks(regions));
-  bool eliminatedOpsOrArgs = succeeded(runRegionDCE(regions));
-  bool mergedIdenticalBlocks = succeeded(mergeIdenticalBlocks(regions));
+LogicalResult mlir::simplifyRegions(RewriterBase &rewriter,
+                                    MutableArrayRef<Region> regions) {
+  bool eliminatedBlocks = succeeded(eraseUnreachableBlocks(rewriter, regions));
+  bool eliminatedOpsOrArgs = succeeded(runRegionDCE(rewriter, regions));
+  bool mergedIdenticalBlocks =
+      succeeded(mergeIdenticalBlocks(rewriter, regions));
   return success(eliminatedBlocks || eliminatedOpsOrArgs ||
                  mergedIdenticalBlocks);
 }
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 2824fdea6e90..0a1558f31c18 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -21,12 +21,12 @@ func @single_iteration(%A: memref<?x?x?xi32>) {
 
 // CHECK-LABEL: func @single_iteration(
 // CHECK-SAME: [[ARG0:%.*]]: memref<?x?x?xi32>) {
+// CHECK: [[C42:%.*]] = constant 42 : i32
 // CHECK: [[C0:%.*]] = constant 0 : index
 // CHECK: [[C2:%.*]] = constant 2 : index
 // CHECK: [[C3:%.*]] = constant 3 : index
 // CHECK: [[C6:%.*]] = constant 6 : index
 // CHECK: [[C7:%.*]] = constant 7 : index
-// CHECK: [[C42:%.*]] = constant 42 : i32
 // CHECK: scf.parallel ([[V0:%.*]]) = ([[C3]]) to ([[C6]]) step ([[C2]]) {
 // CHECK: memref.store [[C42]], [[ARG0]]{{\[}}[[C0]], [[V0]], [[C7]]] : memref<?x?x?xi32>
 // CHECK: scf.yield
-- 
GitLab


From 1a75be0023cd80fd8560d689999a63d4368c90e6 Mon Sep 17 00:00:00 2001
From: River Riddle
Date: Fri, 19 Mar 2021 17:11:23 -0700
Subject: [PATCH 0218/1000] [mlir][NFC] Use the native range instead of APInt when computing operand ranges

This removes the need to construct an APInt for each value, given that it is
guaranteed to contain 32 bit elements.

BEGIN_PUBLIC
...text exposed to open source public git repo...
END_PUBLIC
---
 mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index e137df4244f7..d2f2132b1a38 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -90,13 +90,14 @@ const char *adapterSegmentSizeAttrInitCode = R"(
   auto sizeAttr = odsAttrs.get("{0}").cast<::mlir::DenseIntElementsAttr>();
 )";
 const char *opSegmentSizeAttrInitCode = R"(
-  auto sizeAttr = (*this)->getAttrOfType<::mlir::DenseIntElementsAttr>("{0}");
+  auto sizeAttr = (*this)->getAttr("{0}").cast<::mlir::DenseIntElementsAttr>();
 )";
 const char *attrSizedSegmentValueRangeCalcCode = R"(
+  auto sizeAttrValues = sizeAttr.getValues<uint32_t>();
   unsigned start = 0;
   for (unsigned i = 0; i < index; ++i)
-    start += (*(sizeAttr.begin() + i)).getZExtValue();
-  unsigned size = (*(sizeAttr.begin() + index)).getZExtValue();
+    start += *(sizeAttrValues.begin() + i);
+  unsigned size = *(sizeAttrValues.begin() + index);
   return {start, size};
 )";
-- 
GitLab


From 451e7001a097edac229938851b0a50b84a58b514 Mon Sep 17 00:00:00 2001
From: Dhruva Chakrabarti
Date: Fri, 19 Mar 2021 17:40:37 -0700
Subject: [PATCH 0219/1000] Empty test commit, verifying commit access

-- 
GitLab


From f9cac39930c8ae13892f8daa8662e4ec65439f22 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 19 Mar 2021 17:47:29 -0700
Subject: [PATCH 0220/1000] [Driver] Delete compatibility aliases -mpie-copy-relocations and -mno-pie-copy-relocations

They should be unused everywhere.
---
 clang/docs/ClangCommandLineReference.rst         | 4 ----
 clang/include/clang/Driver/Options.td            | 4 ----
 clang/test/Driver/fdirect-access-external-data.c | 4 ----
 3 files changed, 12 deletions(-)

diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst
index 962d717483e0..d895587c458a 100644
--- a/clang/docs/ClangCommandLineReference.rst
+++ b/clang/docs/ClangCommandLineReference.rst
@@ -2797,10 +2797,6 @@ Use packed stack layout (SystemZ only).
 
 Specify maximum number of prefixes to use for padding
 
-.. option:: -mpie-copy-relocations, -mno-pie-copy-relocations
-
-Use copy relocations support for PIE builds
-
 .. option:: -mprefer-vector-width=<arg>
 
 Specifies preferred vector width for auto-vectorization. Defaults to 'none' which allows target specific decisions.
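For anyone still passing the removed aliases: they forwarded to the direct-access flags, so migration is a one-for-one respelling, as the driver test retained below still demonstrates. A hypothetical invocation pair (foo.c and the surrounding flags are placeholders):

    # Before: compatibility alias, now removed
    clang -fpic -mpie-copy-relocations -c foo.c
    # After: the canonical flag the alias mapped to
    clang -fpic -fdirect-access-external-data -c foo.c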
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 85a0e02e6357..6e22bd01bea3 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3323,10 +3323,6 @@ def mstack_protector_guard_offset_EQ : Joined<["-"], "mstack-protector-guard-off
 def mstack_protector_guard_reg_EQ : Joined<["-"], "mstack-protector-guard-reg=">, Group<m_Group>, Flags<[CC1Option]>,
   HelpText<"Use the given reg for addressing the stack-protector guard">,
   MarshallingInfoString<CodeGenOpts<"StackProtectorGuardReg">, [{"none"}]>;
-def mpie_copy_relocations : Flag<["-"], "mpie-copy-relocations">,
-  Alias<fdirect_access_external_data>, Group<m_Group>;
-def mno_pie_copy_relocations : Flag<["-"], "mno-pie-copy-relocations">,
-  Alias<fno_direct_access_external_data>, Group<m_Group>;
 def mfentry : Flag<["-"], "mfentry">, HelpText<"Insert calls to fentry at function entry (x86/SystemZ only)">,
   Flags<[CC1Option]>, Group<m_Group>,
   MarshallingInfoFlag<CodeGenOpts<"CallFEntry">>;
diff --git a/clang/test/Driver/fdirect-access-external-data.c b/clang/test/Driver/fdirect-access-external-data.c
index c3fc93064179..f132b1b088af 100644
--- a/clang/test/Driver/fdirect-access-external-data.c
+++ b/clang/test/Driver/fdirect-access-external-data.c
@@ -9,10 +9,6 @@
 // RUN: %clang -### -c -target aarch64 %s -fpic 2>&1 | FileCheck %s --check-prefix=DEFAULT
 // RUN: %clang -### -c -target aarch64 %s -fpic -fdirect-access-external-data 2>&1 | FileCheck %s --check-prefix=DIRECT
 
-/// -m[no-]pie-copy-relocations are aliases for compatibility.
-// RUN: %clang -### -c -target riscv64 %s -mno-pie-copy-relocations 2>&1 | FileCheck %s --check-prefix=INDIRECT
-// RUN: %clang -### -c -target riscv64 %s -fpic -mpie-copy-relocations 2>&1 | FileCheck %s --check-prefix=DIRECT
-
 // DEFAULT-NOT: direct-access-external-data"
 // DIRECT:      "-fdirect-access-external-data"
 // INDIRECT:    "-fno-direct-access-external-data"
-- 
GitLab


From e76b86642f51c868c307d097cb129df1b1ac6423 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Fri, 19 Mar 2021 17:57:17 -0700
Subject: [PATCH 0221/1000] Revert "[lldb] Make the API, Shell and Unit tests independent lit test suites"

This reverts commit 6c52d4fd4c24a0cf738e44516ca8378d65dcf019.
---
 lldb/test/API/CMakeLists.txt     | 27 ++++++++++++---------------
 lldb/test/API/lit.cfg.py         |  5 ++---
 lldb/test/API/lit.site.cfg.py.in |  1 +
 lldb/test/CMakeLists.txt         | 18 ++++++++++++------
 lldb/test/Shell/CMakeLists.txt   | 19 ++++++++-----------
 lldb/test/Unit/CMakeLists.txt    | 12 +-----------
 lldb/unittests/CMakeLists.txt    |  3 +--
 7 files changed, 37 insertions(+), 48 deletions(-)

diff --git a/lldb/test/API/CMakeLists.txt b/lldb/test/API/CMakeLists.txt
index 2b7dba456b1a..0dbc46defc81 100644
--- a/lldb/test/API/CMakeLists.txt
+++ b/lldb/test/API/CMakeLists.txt
@@ -1,10 +1,3 @@
-add_custom_target(lldb-api-test-deps)
-add_dependencies(lldb-api-test-deps lldb-test-deps)
-
-add_lit_testsuites(LLDB-API
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  DEPENDS lldb-api-test-deps)
-
 function(add_python_test_target name test_script args comment)
   set(PYTHON_TEST_COMMAND
     ${Python3_EXECUTABLE}
@@ -160,35 +153,39 @@ string(REPLACE ${CMAKE_CFG_INTDIR} ${dotest_args_replacement} LLDB_TEST_EXECUTAB
 string(REPLACE ${CMAKE_CFG_INTDIR} ${dotest_args_replacement} LLDB_TEST_COMPILER "${LLDB_TEST_COMPILER}")
 string(REPLACE ${CMAKE_CFG_INTDIR} ${dotest_args_replacement} LLDB_TEST_DSYMUTIL "${LLDB_TEST_DSYMUTIL}")
 
+# Configure the API test suite.
configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py MAIN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py) +if (CMAKE_GENERATOR STREQUAL "Xcode") + # Xcode does not get the auto-generated targets. We need to create + # check-lldb-api manually. + add_lit_testsuite(check-lldb-api "Running lldb api test suite" + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS lldb-test-deps) +endif() + # Targets for running the test suite on the different Apple simulators. add_lit_testsuite(check-lldb-simulator-ios "Running lldb test suite on the iOS simulator" ${CMAKE_CURRENT_BINARY_DIR} PARAMS "lldb-run-with-simulator=ios" EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-api-test-deps) + DEPENDS lldb-test-deps) add_lit_testsuite(check-lldb-simulator-watchos "Running lldb test suite on the watchOS simulator" ${CMAKE_CURRENT_BINARY_DIR} PARAMS "lldb-run-with-simulator=watchos" EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-api-test-deps) + DEPENDS lldb-test-deps) add_lit_testsuite(check-lldb-simulator-tvos "Running lldb test suite on the tvOS simulator" ${CMAKE_CURRENT_BINARY_DIR} PARAMS "lldb-run-with-simulator=tvos" EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-api-test-deps) - -add_lit_testsuite(check-lldb-api "Running lldb api test suite" - ${CMAKE_CURRENT_BINARY_DIR} - EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-api-test-deps) + DEPENDS lldb-test-deps) diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 1bd7dc35fb2a..54a02453b174 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -17,10 +17,9 @@ config.name = 'lldb-api' config.suffixes = ['.py'] # test_source_root: The root path where tests are located. -config.test_source_root = os.path.dirname(__file__) - # test_exec_root: The root path where tests should be run. -config.test_exec_root = os.path.join(config.lldb_obj_root, 'test') +config.test_source_root = os.path.dirname(__file__) +config.test_exec_root = config.test_source_root def mkdir_p(path): diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 49ea94aacd11..2e368325a9f0 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -1,5 +1,6 @@ @LIT_SITE_CFG_IN_HEADER@ +config.test_exec_root = "@LLDB_BINARY_DIR@" config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = "@LLVM_TOOLS_DIR@" diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt index c6b01c66a0ef..8363bde23035 100644 --- a/lldb/test/CMakeLists.txt +++ b/lldb/test/CMakeLists.txt @@ -185,13 +185,19 @@ configure_lit_site_cfg( MAIN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py) -add_lit_testsuite(check-lldb "Running lldb lit test suite" +add_lit_testsuites(LLDB + ${CMAKE_CURRENT_SOURCE_DIR} + DEPENDS lldb-test-deps) + +add_lit_testsuite(check-lldb-lit "Running lldb lit test suite" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS - lldb-api-test-deps - lldb-shell-test-deps - lldb-unit-test-deps) -set_target_properties(check-lldb PROPERTIES FOLDER "lldb tests") + DEPENDS lldb-test-deps) +set_target_properties(check-lldb-lit PROPERTIES FOLDER "lldb tests") + +add_custom_target(check-lldb) +add_dependencies(check-lldb lldb-test-deps) +set_target_properties(check-lldb PROPERTIES FOLDER "lldb misc") +add_dependencies(check-lldb check-lldb-lit) # Add a lit test suite that runs the API & shell test while capturing a # reproducer. 
diff --git a/lldb/test/Shell/CMakeLists.txt b/lldb/test/Shell/CMakeLists.txt index f0d7b9a34651..d203f1e093c7 100644 --- a/lldb/test/Shell/CMakeLists.txt +++ b/lldb/test/Shell/CMakeLists.txt @@ -1,10 +1,4 @@ -add_custom_target(lldb-shell-test-deps) -add_dependencies(lldb-shell-test-deps lldb-test-deps) - -add_lit_testsuites(LLDB-SHELL - ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS lldb-shell-test-deps) - +# Configure the Shell test suite. configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py @@ -14,7 +8,10 @@ configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/lit-lldb-init.in ${CMAKE_CURRENT_BINARY_DIR}/lit-lldb-init) -add_lit_testsuite(check-lldb-shell "Running lldb shell test suite" - ${CMAKE_CURRENT_BINARY_DIR} - EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-shell-test-deps) +if (CMAKE_GENERATOR STREQUAL "Xcode") + # Xcode does not get the auto-generated targets. We need to create + # check-lldb-shell manually. + add_lit_testsuite(check-lldb-shell "Running lldb shell test suite" + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS lldb-test-deps) +endif() diff --git a/lldb/test/Unit/CMakeLists.txt b/lldb/test/Unit/CMakeLists.txt index 3233c0873c1f..e9b3d9e35d74 100644 --- a/lldb/test/Unit/CMakeLists.txt +++ b/lldb/test/Unit/CMakeLists.txt @@ -1,17 +1,7 @@ -add_custom_target(lldb-unit-test-deps) -add_dependencies(lldb-unit-test-deps lldb-test-deps) - -add_lit_testsuites(LLDB-UNIT - ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS lldb-unit-test-deps) - +# Configure the Unit test suite. configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in ${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg.py MAIN_CONFIG ${CMAKE_CURRENT_SOURCE_DIR}/lit.cfg.py) -add_lit_testsuite(check-lldb-unit "Running lldb unit test suite" - ${CMAKE_CURRENT_BINARY_DIR} - EXCLUDE_FROM_CHECK_ALL - DEPENDS lldb-unit-test-deps) diff --git a/lldb/unittests/CMakeLists.txt b/lldb/unittests/CMakeLists.txt index e7b0f1c17d6d..37a5f972cdec 100644 --- a/lldb/unittests/CMakeLists.txt +++ b/lldb/unittests/CMakeLists.txt @@ -1,7 +1,6 @@ add_custom_target(LLDBUnitTests) set_target_properties(LLDBUnitTests PROPERTIES FOLDER "lldb tests") - -add_dependencies(lldb-unit-test-deps LLDBUnitTests) +add_dependencies(lldb-test-deps LLDBUnitTests) include_directories(${LLDB_SOURCE_ROOT}) include_directories(${LLDB_PROJECT_ROOT}/unittests) -- GitLab From b2f232b830efdc02f6350d4b611977270919613d Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Fri, 19 Mar 2021 17:57:47 -0700 Subject: [PATCH 0222/1000] [testsuite] Make testsuite more stable vs canonicalization change. NFC. 
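The change is mechanical: constants that canonicalization may now materialize in a different order are matched with CHECK-DAG rather than order-sensitive CHECK lines. For readers unfamiliar with the directive, a hypothetical snippet (not taken from this patch) shows the difference:

    // Plain CHECK enforces textual order and fails if the canonicalizer
    // emits the constants swapped:
    // CHECK: constant 0 : index
    // CHECK: constant 1 : index

    // A run of adjacent CHECK-DAG directives matches the same lines in
    // any relative order:
    // CHECK-DAG: constant 0 : index
    // CHECK-DAG: constant 1 : index

A few tests below are instead disabled pending PR49590 by renaming their directives to HECK, which keeps the expected lines visible while FileCheck ignores them.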
Differential Revision: https://reviews.llvm.org/D98998 --- .../StandardToSPIRV/legalization.mlir | 6 +- .../VectorToSCF/vector-to-loops.mlir | 27 +++---- mlir/test/Dialect/Linalg/sparse_2d.mlir | 6 +- mlir/test/Dialect/Quant/convert-const.mlir | 12 +-- mlir/test/Dialect/SCF/canonicalize.mlir | 12 +-- .../SPIRV/Transforms/canonicalize.mlir | 81 +++++++++++-------- mlir/test/Dialect/Tensor/canonicalize.mlir | 10 +-- mlir/test/Dialect/Vector/canonicalize.mlir | 14 ++-- .../Vector/vector-contract-transforms.mlir | 9 ++- .../Vector/vector-flat-transforms.mlir | 10 ++- .../vector-transfer-full-partial-split.mlir | 16 ++-- .../Vector/vector-transfer-unroll.mlir | 24 +++--- .../Dialect/Vector/vector-transforms.mlir | 12 +-- mlir/test/Transforms/canonicalize.mlir | 20 ++--- .../Transforms/parallel-loop-collapsing.mlir | 18 ++--- .../single-parallel-loop-collapsing.mlir | 14 ++-- mlir/test/Transforms/test-canonicalize.mlir | 19 +++++ 17 files changed, 169 insertions(+), 141 deletions(-) diff --git a/mlir/test/Conversion/StandardToSPIRV/legalization.mlir b/mlir/test/Conversion/StandardToSPIRV/legalization.mlir index 98b5d930eee1..e7fa980186e0 100644 --- a/mlir/test/Conversion/StandardToSPIRV/legalization.mlir +++ b/mlir/test/Conversion/StandardToSPIRV/legalization.mlir @@ -67,9 +67,9 @@ func @fold_dynamic_stride_subview_with_store(%arg0 : memref<12x32xf32>, %arg1 : // CHECK-SAME: [[ARG0:%.*]]: memref<12x32xf32>, [[ARG1:%.*]]: index, [[ARG2:%.*]]: index, [[ARG3:%.*]]: index, [[ARG4:%.*]]: index func @fold_static_stride_subview_with_transfer_read(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) -> vector<4xf32> { // CHECK-NOT: memref.subview - // CHECK: [[F1:%.*]] = constant 1.000000e+00 : f32 - // CHECK: [[C2:%.*]] = constant 2 : index - // CHECK: [[C3:%.*]] = constant 3 : index + // CHECK-DAG: [[F1:%.*]] = constant 1.000000e+00 : f32 + // CHECK-DAG: [[C2:%.*]] = constant 2 : index + // CHECK-DAG: [[C3:%.*]] = constant 3 : index // CHECK: [[STRIDE1:%.*]] = muli [[ARG3]], [[C2]] : index // CHECK: [[INDEX1:%.*]] = addi [[ARG1]], [[STRIDE1]] : index // CHECK: [[STRIDE2:%.*]] = muli [[ARG4]], [[C3]] : index diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir index 97e4f4c37dc3..1ebacc8ef274 100644 --- a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir @@ -200,18 +200,16 @@ func @materialize_write(%M: index, %N: index, %O: index, %P: index) { // FULL-UNROLL-SAME: %[[base:[a-zA-Z0-9]+]]: index func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x15xf32> { - // CHECK: %[[cst:.*]] = constant 7.000000e+00 : f32 %f7 = constant 7.0: f32 - + // CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK-DAG: %[[splat:.*]] = constant dense<7.000000e+00> : vector<15xf32> // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<3xvector<15xf32>> - // CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK-DAG: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref // CHECK: affine.for %[[I:.*]] = 0 to 3 { // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] // CHECK: %[[cond1:.*]] = cmpi slt, %[[add]], %[[dim]] : index // CHECK: scf.if %[[cond1]] { - // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%[[add]], %[[base]]], %[[cst]] : memref, vector<15xf32> + // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%[[add]], %[[base]]], %cst : memref, vector<15xf32> // CHECK: store %[[vec_1d]], %[[alloc]][%[[I]]] : 
memref<3xvector<15xf32>> // CHECK: } else { // CHECK: store %[[splat]], %[[alloc]][%[[I]]] : memref<3xvector<15xf32>> @@ -219,14 +217,13 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<3xvector<15xf32>> to memref> // CHECK: %[[cst:.*]] = memref.load %[[vmemref]][] : memref> - // FULL-UNROLL: %[[pad:.*]] = constant 7.000000e+00 : f32 // FULL-UNROLL: %[[VEC0:.*]] = constant dense<7.000000e+00> : vector<3x15xf32> // FULL-UNROLL: %[[C0:.*]] = constant 0 : index // FULL-UNROLL: %[[SPLAT:.*]] = constant dense<7.000000e+00> : vector<15xf32> // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref // FULL-UNROLL: cmpi slt, %[[base]], %[[DIM]] : index // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %[[pad]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %cst : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -236,7 +233,7 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x // FULL-UNROLL: affine.apply #[[$MAP1]]()[%[[base]]] // FULL-UNROLL: cmpi slt, %{{.*}}, %[[DIM]] : index // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[pad]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %cst : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -246,7 +243,7 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]] // FULL-UNROLL: cmpi slt, %{{.*}}, %[[DIM]] : index // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %[[pad]] : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %cst : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -380,16 +377,16 @@ func @transfer_read_minor_identity(%A : memref) -> vector<3x3xf32> // CHECK-LABEL: transfer_read_minor_identity( // CHECK-SAME: %[[A:.*]]: memref) -> vector<3x3xf32> -// CHECK: %[[c0:.*]] = constant 0 : index -// CHECK: %[[cst:.*]] = constant 0.000000e+00 : f32 -// CHECK: %[[c2:.*]] = constant 2 : index -// CHECK: %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32> +// CHECK-DAG: %[[c0:.*]] = constant 0 : index +// CHECK-DAG: %cst = constant 0.000000e+00 : f32 +// CHECK-DAG: %[[c2:.*]] = constant 2 : index +// CHECK-DAG: %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32> // CHECK: %[[m:.*]] = memref.alloca() : memref<3xvector<3xf32>> // CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref // CHECK: affine.for %[[arg1:.*]] = 0 to 3 { // CHECK: %[[cmp:.*]] = cmpi slt, %[[arg1]], %[[d]] : index // CHECK: scf.if %[[cmp]] { -// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%[[c0]], %[[c0]], %[[arg1]], %[[c0]]], %[[cst]] : memref, vector<3xf32> +// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %cst : memref, 
vector<3xf32> // CHECK: store %[[tr]], %[[m]][%[[arg1]]] : memref<3xvector<3xf32>> // CHECK: } else { // CHECK: store %[[cst0]], %[[m]][%[[arg1]]] : memref<3xvector<3xf32>> @@ -411,8 +408,8 @@ func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref, // CHECK-SAME: %[[B:.*]]: memref) -// CHECK: %[[c0:.*]] = constant 0 : index -// CHECK: %[[c2:.*]] = constant 2 : index +// CHECK-DAG: %[[c2:.*]] = constant 2 : index +// CHECK-DAG: %[[c0:.*]] = constant 0 : index // CHECK: %[[m:.*]] = memref.alloca() : memref<3xvector<3xf32>> // CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<3xvector<3xf32>> to memref> // CHECK: store %[[A]], %[[cast]][] : memref> diff --git a/mlir/test/Dialect/Linalg/sparse_2d.mlir b/mlir/test/Dialect/Linalg/sparse_2d.mlir index 24ccdfc20b14..b9e14e3afb8e 100644 --- a/mlir/test/Dialect/Linalg/sparse_2d.mlir +++ b/mlir/test/Dialect/Linalg/sparse_2d.mlir @@ -1163,9 +1163,9 @@ func @sum_reduction(%arga: tensor<10x20xf32>, %argx: tensor) -> tensor // CHECK-LABEL: func @scale( // CHECK-SAME: %[[VAL_0:.*]]: tensor, // CHECK-SAME: %[[VAL_1:.*]]: tensor) -> tensor { -// CHECK: %[[VAL_2:.*]] = constant 2.000000e+00 : f64 -// CHECK: %[[VAL_3:.*]] = constant 0 : index -// CHECK: %[[VAL_4:.*]] = constant 1 : index +// CHECK-DAG: %[[VAL_3:.*]] = constant 0 : index +// CHECK-DAG: %[[VAL_4:.*]] = constant 1 : index +// CHECK-DAG: %[[VAL_2:.*]] = constant 2.000000e+00 : f64 // CHECK: %[[VAL_5:.*]] = linalg.sparse_pointers %[[VAL_0]], %[[VAL_4]] : tensor to memref // CHECK: %[[VAL_6:.*]] = linalg.sparse_indices %[[VAL_0]], %[[VAL_4]] : tensor to memref // CHECK: %[[VAL_7:.*]] = linalg.sparse_values %[[VAL_0]] : tensor to memref diff --git a/mlir/test/Dialect/Quant/convert-const.mlir b/mlir/test/Dialect/Quant/convert-const.mlir index bb8f8cf61c9d..fb6baa25ba4c 100644 --- a/mlir/test/Dialect/Quant/convert-const.mlir +++ b/mlir/test/Dialect/Quant/convert-const.mlir @@ -144,9 +144,9 @@ func @const_custom_storage_range_i8_fixedpoint() -> tensor<7xf32> { // CHECK-LABEL: zero_tensors_to_zero_points func @zero_tensors_to_zero_points() -> (tensor<7xf32>, tensor<7xf32>, tensor<7xf32>, tensor<7xf32>) { -// CHECK: %[[cst:.*]] = constant dense<-127> : tensor<7xi8> -// CHECK: %[[cst0:.*]] = constant dense<0> : tensor<7xi8> -// CHECK: %[[cst1:.*]] = constant dense<1> : tensor<7xi8> +// CHECK-DAG: %[[cst1:.*]] = constant dense<1> : tensor<7xi8> +// CHECK-DAG: %[[cst:.*]] = constant dense<-127> : tensor<7xi8> +// CHECK-DAG: %[[cst0:.*]] = constant dense<0> : tensor<7xi8> // CHECK: "quant.scast"(%[[cst0]]) : (tensor<7xi8>) -> tensor<7x!quant.uniform> // CHECK: "quant.scast"(%[[cst]]) : (tensor<7xi8>) -> tensor<7x!quant.uniform:f32, 1.000000e+00:-127>> // CHECK: "quant.scast"(%[[cst0]]) : (tensor<7xi8>) -> tensor<7x!quant.uniform> @@ -176,10 +176,10 @@ func @zero_tensors_to_zero_points() -> (tensor<7xf32>, tensor<7xf32>, tensor<7xf // CHECK-LABEL: per_axis_dense_quantization func @per_axis_dense_quantization() -> (tensor<2x3xf32>, tensor<2x3xf32>) { -// CHECK-NEXT: %[[cst:.*]] = constant dense<{{\[}}[-128, 64, 127], [0, 1, 2]]> : tensor<2x3xi8> -// CHECK-NEXT: %[[cst0:.*]] = constant dense<{{\[}}[-128, -1, 1], [127, 1, 3]]> : tensor<2x3xi8> +// CHECK-DAG: %[[cst0:.*]] = constant dense<{{\[}}[-128, -1, 1], [127, 1, 3]]> : tensor<2x3xi8> +// CHECK-DAG: %[[cst:.*]] = constant dense<{{\[}}[-128, 64, 127], [0, 1, 2]]> : tensor<2x3xi8> // CHECK: "quant.scast"(%[[cst]]) : (tensor<2x3xi8>) -> tensor<2x3x!quant.uniform> -// CHECK: "quant.scast"(%cst_0) : (tensor<2x3xi8>) -> 
tensor<2x3x!quant.uniform> +// CHECK: "quant.scast"(%[[cst0]]) : (tensor<2x3xi8>) -> tensor<2x3x!quant.uniform> %cst = constant dense<[[-2.0, -0.5, 0.0], [0.0, 1.0, 2.0]]> : tensor<2x3xf32> %1 = "quant.qcast"(%cst) : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform> diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir index 0a1558f31c18..dffe9e252eb1 100644 --- a/mlir/test/Dialect/SCF/canonicalize.mlir +++ b/mlir/test/Dialect/SCF/canonicalize.mlir @@ -21,12 +21,12 @@ func @single_iteration(%A: memref) { // CHECK-LABEL: func @single_iteration( // CHECK-SAME: [[ARG0:%.*]]: memref) { -// CHECK: [[C42:%.*]] = constant 42 : i32 -// CHECK: [[C0:%.*]] = constant 0 : index -// CHECK: [[C2:%.*]] = constant 2 : index -// CHECK: [[C3:%.*]] = constant 3 : index -// CHECK: [[C6:%.*]] = constant 6 : index -// CHECK: [[C7:%.*]] = constant 7 : index +// CHECK-DAG: [[C42:%.*]] = constant 42 : i32 +// CHECK-DAG: [[C7:%.*]] = constant 7 : index +// CHECK-DAG: [[C6:%.*]] = constant 6 : index +// CHECK-DAG: [[C3:%.*]] = constant 3 : index +// CHECK-DAG: [[C2:%.*]] = constant 2 : index +// CHECK-DAG: [[C0:%.*]] = constant 0 : index // CHECK: scf.parallel ([[V0:%.*]]) = ([[C3]]) to ([[C6]]) step ([[C2]]) { // CHECK: memref.store [[C42]], [[ARG0]]{{\[}}[[C0]], [[V0]], [[C7]]] : memref // CHECK: scf.yield diff --git a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir index cc1db79d1c37..48c2502cb914 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/canonicalize.mlir @@ -92,9 +92,9 @@ func @convert_bitcast_multi_use(%arg0 : vector<2xf32>, %arg1 : !spv.ptr (i32, i32, i32) { - // CHECK: spv.Constant 42 : i32 - // CHECK: spv.Constant -33 : i32 - // CHECK: spv.Constant 6 : i32 + // CHECK-DAG: spv.Constant 6 : i32 + // CHECK-DAG: spv.Constant -33 : i32 + // CHECK-DAG: spv.Constant 42 : i32 %0 = spv.Constant dense<[42, -33, 6]> : vector<3xi32> %1 = spv.CompositeExtract %0[0 : i32] : vector<3xi32> %2 = spv.CompositeExtract %0[1 : i32] : vector<3xi32> @@ -106,8 +106,8 @@ func @extract_vector() -> (i32, i32, i32) { // CHECK-LABEL: extract_array_final func @extract_array_final() -> (i32, i32) { - // CHECK: spv.Constant 4 : i32 - // CHECK: spv.Constant -5 : i32 + // CHECK-DAG: spv.Constant -5 : i32 + // CHECK-DAG: spv.Constant 4 : i32 %0 = spv.Constant [dense<[4, -5]> : vector<2xi32>] : !spv.array<1 x vector<2xi32>> %1 = spv.CompositeExtract %0[0 : i32, 0 : i32] : !spv.array<1 x vector<2 x i32>> %2 = spv.CompositeExtract %0[0 : i32, 1 : i32] : !spv.array<1 x vector<2 x i32>> @@ -192,9 +192,9 @@ func @const_fold_scalar_iadd_normal() -> (i32, i32, i32) { %c5 = spv.Constant 5 : i32 %cn8 = spv.Constant -8 : i32 - // CHECK: spv.Constant 10 - // CHECK: spv.Constant -16 - // CHECK: spv.Constant -3 + // CHECK-DAG: spv.Constant -3 + // CHECK-DAG: spv.Constant -16 + // CHECK-DAG: spv.Constant 10 %0 = spv.IAdd %c5, %c5 : i32 %1 = spv.IAdd %cn8, %cn8 : i32 %2 = spv.IAdd %c5, %cn8 : i32 @@ -210,17 +210,17 @@ func @const_fold_scalar_iadd_flow() -> (i32, i32, i32, i32) { %c5 = spv.Constant -1 : i32 // : 0xffff ffff %c6 = spv.Constant -2 : i32 // : 0xffff fffe + // 0x8000 0000 + 0xffff fffe = 0x1 7fff fffe -> 0x7fff fffe + // CHECK-DAG: spv.Constant 2147483646 + // 0x8000 0000 + 0xffff ffff = 0x1 7fff ffff -> 0x7fff ffff + // CHECK-DAG: spv.Constant 2147483647 + // 0x0000 0002 + 0xffff ffff = 0x1 0000 0001 -> 0x0000 0001 + // CHECK-DAG: spv.Constant 1 // 0x0000 0001 + 0xffff ffff = 0x1 
0000 0000 -> 0x0000 0000 - // CHECK: spv.Constant 0 + // CHECK-DAG: spv.Constant 0 %0 = spv.IAdd %c1, %c3 : i32 - // 0x0000 0002 + 0xffff ffff = 0x1 0000 0001 -> 0x0000 0001 - // CHECK: spv.Constant 1 - %1 = spv.IAdd %c2, %c3 : i32 - // 0x8000 0000 + 0xffff ffff = 0x1 7fff ffff -> 0x7fff ffff - // CHECK: spv.Constant 2147483647 + %1 = spv.IAdd %c2, %c3 : i32 %2 = spv.IAdd %c4, %c5 : i32 - // 0x8000 0000 + 0xffff fffe = 0x1 7fff fffe -> 0x7fff fffe - // CHECK: spv.Constant 2147483646 %3 = spv.IAdd %c4, %c6 : i32 return %0, %1, %2, %3: i32, i32, i32, i32 } @@ -259,9 +259,9 @@ func @const_fold_scalar_imul_normal() -> (i32, i32, i32) { %cn8 = spv.Constant -8 : i32 %c7 = spv.Constant 7 : i32 - // CHECK: spv.Constant 35 - // CHECK: spv.Constant -40 - // CHECK: spv.Constant -56 + // CHECK-DAG: spv.Constant -56 + // CHECK-DAG: spv.Constant -40 + // CHECK-DAG: spv.Constant 35 %0 = spv.IMul %c7, %c5 : i32 %1 = spv.IMul %c5, %cn8 : i32 %2 = spv.IMul %cn8, %c7 : i32 @@ -275,13 +275,14 @@ func @const_fold_scalar_imul_flow() -> (i32, i32, i32) { %c3 = spv.Constant 4294967295 : i32 // 2^32 - 1 : 0xffff ffff %c4 = spv.Constant 2147483647 : i32 // 2^31 - 1 : 0x7fff ffff + // (0x7fff ffff << 2) = 0x1 ffff fffc -> 0xffff fffc + // CHECK-DAG: %[[CST4:.*]] = spv.Constant -4 + // (0xffff ffff << 1) = 0x1 ffff fffe -> 0xffff fffe - // CHECK: %[[CST2:.*]] = spv.Constant -2 + // CHECK-DAG: %[[CST2:.*]] = spv.Constant -2 %0 = spv.IMul %c1, %c3 : i32 // (0x7fff ffff << 1) = 0x0 ffff fffe -> 0xffff fffe %1 = spv.IMul %c1, %c4 : i32 - // (0x7fff ffff << 2) = 0x1 ffff fffc -> 0xffff fffc - // CHECK: %[[CST4:.*]] = spv.Constant -4 %2 = spv.IMul %c4, %c2 : i32 // CHECK: return %[[CST2]], %[[CST2]], %[[CST4]] return %0, %1, %2: i32, i32, i32 @@ -317,9 +318,9 @@ func @const_fold_scalar_isub_normal() -> (i32, i32, i32) { %cn8 = spv.Constant -8 : i32 %c7 = spv.Constant 7 : i32 - // CHECK: spv.Constant 2 - // CHECK: spv.Constant 13 - // CHECK: spv.Constant -15 + // CHECK-DAG: spv.Constant -15 + // CHECK-DAG: spv.Constant 13 + // CHECK-DAG: spv.Constant 2 %0 = spv.ISub %c7, %c5 : i32 %1 = spv.ISub %c5, %cn8 : i32 %2 = spv.ISub %cn8, %c7 : i32 @@ -335,17 +336,17 @@ func @const_fold_scalar_isub_flow() -> (i32, i32, i32, i32) { %c5 = spv.Constant -1 : i32 // : 0xffff ffff %c6 = spv.Constant -2 : i32 // : 0xffff fffe + // 0xffff ffff - 0x7fff ffff -> 0xffff ffff + 0x8000 0001 = 0x1 8000 0000 + // CHECK-DAG: spv.Constant -2147483648 + // 0x0000 0001 - 0xffff ffff -> 0x0000 0001 + 0x0000 0001 = 0x0000 0002 + // CHECK-DAG: spv.Constant 2 // 0x0000 0000 - 0xffff ffff -> 0x0000 0000 + 0x0000 0001 = 0x0000 0001 - // CHECK: spv.Constant 1 + // CHECK-DAG: spv.Constant 1 + // 0xffff fffe - 0x7fff ffff -> 0xffff fffe + 0x8000 0001 = 0x1 7fff ffff + // CHECK-DAG: spv.Constant 2147483647 %0 = spv.ISub %c1, %c3 : i32 - // 0x0000 0001 - 0xffff ffff -> 0x0000 0001 + 0x0000 0001 = 0x0000 0002 - // CHECK: spv.Constant 2 %1 = spv.ISub %c2, %c3 : i32 - // 0xffff ffff - 0x7fff ffff -> 0xffff ffff + 0x8000 0001 = 0x1 8000 0000 - // CHECK: spv.Constant -2147483648 %2 = spv.ISub %c5, %c4 : i32 - // 0xffff fffe - 0x7fff ffff -> 0xffff fffe + 0x8000 0001 = 0x1 7fff ffff - // CHECK: spv.Constant 2147483647 %3 = spv.ISub %c6, %c4 : i32 return %0, %1, %2, %3: i32, i32, i32, i32 } @@ -545,12 +546,14 @@ func @canonicalize_selection_op_vector_type(%cond: i1) -> () { // ----- +// CHECK-LABEL: cannot_canonicalize_selection_op_0 + // Store to a different variables. 
func @cannot_canonicalize_selection_op_0(%cond: i1) -> () { %0 = spv.Constant dense<[0, 1, 2]> : vector<3xi32> - // CHECK: %[[SRC_VALUE_0:.*]] = spv.Constant dense<[1, 2, 3]> : vector<3xi32> + // CHECK-DAG: %[[SRC_VALUE_1:.*]] = spv.Constant dense<[2, 3, 4]> : vector<3xi32> + // CHECK-DAG: %[[SRC_VALUE_0:.*]] = spv.Constant dense<[1, 2, 3]> : vector<3xi32> %1 = spv.Constant dense<[1, 2, 3]> : vector<3xi32> - // CHECK: %[[SRC_VALUE_1:.*]] = spv.Constant dense<[2, 3, 4]> : vector<3xi32> %2 = spv.Constant dense<[2, 3, 4]> : vector<3xi32> // CHECK: %[[DST_VAR_0:.*]] = spv.Variable init({{%.*}}) : !spv.ptr, Function> %3 = spv.Variable init(%0) : !spv.ptr, Function> @@ -582,6 +585,8 @@ func @cannot_canonicalize_selection_op_0(%cond: i1) -> () { // ----- +// CHECK-LABEL: cannot_canonicalize_selection_op_1 + // A conditional block consists of more than 2 operations. func @cannot_canonicalize_selection_op_1(%cond: i1) -> () { %0 = spv.Constant dense<[0, 1, 2]> : vector<3xi32> @@ -618,6 +623,8 @@ func @cannot_canonicalize_selection_op_1(%cond: i1) -> () { // ----- +// CHECK-LABEL: cannot_canonicalize_selection_op_2 + // A control-flow goes into `^then` block from `^else` block. func @cannot_canonicalize_selection_op_2(%cond: i1) -> () { %0 = spv.Constant dense<[0, 1, 2]> : vector<3xi32> @@ -650,11 +657,13 @@ func @cannot_canonicalize_selection_op_2(%cond: i1) -> () { // ----- +// CHECK-LABEL: cannot_canonicalize_selection_op_3 + // `spv.Return` as a block terminator. func @cannot_canonicalize_selection_op_3(%cond: i1) -> () { %0 = spv.Constant dense<[0, 1, 2]> : vector<3xi32> - // CHECK: %[[SRC_VALUE_0:.*]] = spv.Constant dense<[1, 2, 3]> : vector<3xi32> %1 = spv.Constant dense<[1, 2, 3]> : vector<3xi32> + // CHECK: %[[SRC_VALUE_0:.*]] = spv.Constant dense<[1, 2, 3]> : vector<3xi32> // CHECK: %[[SRC_VALUE_1:.*]] = spv.Constant dense<[2, 3, 4]> : vector<3xi32> %2 = spv.Constant dense<[2, 3, 4]> : vector<3xi32> // CHECK: %[[DST_VAR:.*]] = spv.Variable init({{%.*}}) : !spv.ptr, Function> @@ -682,6 +691,8 @@ func @cannot_canonicalize_selection_op_3(%cond: i1) -> () { // ----- +// CHECK-LABEL: cannot_canonicalize_selection_op_4 + // Different memory access attributes. func @cannot_canonicalize_selection_op_4(%cond: i1) -> () { %0 = spv.Constant dense<[0, 1, 2]> : vector<3xi32> diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index c274b6f8b1c9..c8ad16ab9b14 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -69,25 +69,25 @@ func @fold_extract(%arg0 : index) -> (f32, f16, f16, i32) { %const_0 = constant 0 : index %const_1 = constant 1 : index %const_3 = constant 3 : index + // CHECK-DAG: [[C64:%.+]] = constant 64 : i32 + // CHECK-DAG: [[C0:%.+]] = constant 0.{{0*}}e+00 : f16 + // CHECK-DAG: [[CM2:%.+]] = constant -2.{{0*}}e+00 : f16 // Fold an extract into a splat. - // CHECK-NEXT: [[C4:%.+]] = constant 4.{{0*}}e+00 : f32 + // CHECK-DAG: [[C4:%.+]] = constant 4.{{0*}}e+00 : f32 %0 = constant dense<4.0> : tensor<4xf32> %ext_1 = tensor.extract %0[%arg0] : tensor<4xf32> // Fold an extract into a sparse with a sparse index. - // CHECK-NEXT: [[CM2:%.+]] = constant -2.{{0*}}e+00 : f16 %1 = constant sparse<[[0, 0, 0], [1, 1, 1]], [-5.0, -2.0]> : tensor<4x4x4xf16> %ext_2 = tensor.extract %1[%const_1, %const_1, %const_1] : tensor<4x4x4xf16> // Fold an extract into a sparse with a non sparse index. 
- // CHECK-NEXT: [[C0:%.+]] = constant 0.{{0*}}e+00 : f16 %2 = constant sparse<[[1, 1, 1]], [-2.0]> : tensor<1x1x1xf16> %ext_3 = tensor.extract %2[%const_0, %const_0, %const_0] : tensor<1x1x1xf16> // Fold an extract into a dense tensor. - // CHECK-NEXT: [[C64:%.+]] = constant 64 : i32 - %3 = constant dense<[[[1, -2, 1, 36]], [[0, 2, -1, 64]]]> : tensor<2x1x4xi32> + %3 = constant dense<[[[1, -2, 1, 36]], [[0, 2, -1, 64]]]> : tensor<2x1x4xi32> %ext_4 = tensor.extract %3[%const_1, %const_0, %const_3] : tensor<2x1x4xi32> // CHECK-NEXT: return [[C4]], [[CM2]], [[C0]], [[C64]] diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index a68c7fba8e1c..c6ec156e1519 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -571,10 +571,10 @@ func @bitcast_folding(%I1: vector<4x8xf32>, %I2: vector<2xi32>) -> (vector<4x8xf } // CHECK-LABEL: func @bitcast_f16_to_f32 -// bit pattern: 0x00000000 -// CHECK: %[[CST0:.+]] = constant dense<0.000000e+00> : vector<4xf32> // bit pattern: 0x40004000 -// CHECK: %[[CST1:.+]] = constant dense<2.00390625> : vector<4xf32> +// CHECK-DAG: %[[CST1:.+]] = constant dense<2.00390625> : vector<4xf32> +// bit pattern: 0x00000000 +// CHECK-DAG: %[[CST0:.+]] = constant dense<0.000000e+00> : vector<4xf32> // CHECK: return %[[CST0]], %[[CST1]] func @bitcast_f16_to_f32() -> (vector<4xf32>, vector<4xf32>) { %cst0 = constant dense<0.0> : vector<8xf16> // bit pattern: 0x0000 @@ -612,8 +612,8 @@ func @broadcast_folding2() -> vector<4x16xi32> { // ----- // CHECK-LABEL: shape_cast_constant -// CHECK: %[[CST0:.*]] = constant dense<2.000000e+00> : vector<20x2xf32> -// CHECK: %[[CST1:.*]] = constant dense<1> : vector<3x4x2xi32> +// CHECK-DAG: %[[CST1:.*]] = constant dense<1> : vector<3x4x2xi32> +// CHECK-DAG: %[[CST0:.*]] = constant dense<2.000000e+00> : vector<20x2xf32> // CHECK: return %[[CST0]], %[[CST1]] : vector<20x2xf32>, vector<3x4x2xi32> func @shape_cast_constant() -> (vector<20x2xf32>, vector<3x4x2xi32>) { %cst = constant dense<2.000000e+00> : vector<5x4x2xf32> @@ -626,8 +626,8 @@ func @shape_cast_constant() -> (vector<20x2xf32>, vector<3x4x2xi32>) { // ----- // CHECK-LABEL: extract_strided_constant -// CHECK: %[[CST0:.*]] = constant dense<2.000000e+00> : vector<12x2xf32> -// CHECK: %[[CST1:.*]] = constant dense<1> : vector<2x13x3xi32> +// CHECK-DAG: %[[CST1:.*]] = constant dense<1> : vector<2x13x3xi32> +// CHECK-DAG: %[[CST0:.*]] = constant dense<2.000000e+00> : vector<12x2xf32> // CHECK: return %[[CST0]], %[[CST1]] : vector<12x2xf32>, vector<2x13x3xi32> func @extract_strided_constant() -> (vector<12x2xf32>, vector<2x13x3xi32>) { %cst = constant dense<2.000000e+00> : vector<29x7xf32> diff --git a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir index 3adb18c1a2ae..bf13b273d328 100644 --- a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir @@ -431,8 +431,9 @@ func @nop_shape_cast(%arg0: vector<16xf32>) -> vector<16xf32> { } // CHECK-LABEL: func @cancel_shape_cast -// CHECK-SAME: %[[A:.*]]: vector<16xf32> -// CHECK: return %[[A]] : vector<16xf32> +// FIXME: PR49590 +// HECK-SAME: %[[A:.*]]: vector<16xf32> +// HECK: return %[[A]] : vector<16xf32> func @cancel_shape_cast(%arg0: vector<16xf32>) -> vector<16xf32> { %0 = vector.shape_cast %arg0 : vector<16xf32> to vector<4x4xf32> @@ -444,8 +445,8 @@ func @cancel_shape_cast(%arg0: vector<16xf32>) -> 
vector<16xf32> { // llvm.matrix operations // CHECK-LABEL: func @shape_casts func @shape_casts(%a: vector<2x2xf32>) -> (vector<4xf32>, vector<2x2xf32>) { - // CHECK: %[[cst:.*]] = constant dense<0.000000e+00> : vector<4xf32> - // CHECK: %[[cst22:.*]] = constant dense<0.000000e+00> : vector<2x2xf32> + // CHECK-DAG: %[[cst22:.*]] = constant dense<0.000000e+00> : vector<2x2xf32> + // CHECK-DAG: %[[cst:.*]] = constant dense<0.000000e+00> : vector<4xf32> // CHECK: %[[ex0:.*]] = vector.extract %{{.*}}[0] : vector<2x2xf32> // // CHECK: %[[in0:.*]] = vector.insert_strided_slice %[[ex0]], %[[cst]] diff --git a/mlir/test/Dialect/Vector/vector-flat-transforms.mlir b/mlir/test/Dialect/Vector/vector-flat-transforms.mlir index c07d651d985e..8d51d323a1a7 100644 --- a/mlir/test/Dialect/Vector/vector-flat-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-flat-transforms.mlir @@ -22,10 +22,12 @@ func @transpose44_44(%arg0: vector<4x4xf32>) -> vector<4x4xf32> { // Folds preceding shape cast as expected, // no following shape cast folding expected. // +// FIXME: PR49590 - shape_cast not stable. +// // CHECK-LABEL: func @transpose16_44( // CHECK-SAME: %[[A:.*]]: vector<16xf32> -// CHECK: %[[T0:.*]] = vector.flat_transpose %[[A]] {columns = 4 : i32, rows = 4 : i32} : vector<16xf32> -> vector<16xf32> -// CHECK: %[[T1:.*]] = vector.extract_strided_slice %[[T0]] {offsets = [0], sizes = [4], strides = [1]} : vector<16xf32> to vector<4xf32> +// HECK: %[[T0:.*]] = vector.flat_transpose %[[A]] {columns = 4 : i32, rows = 4 : i32} : vector<16xf32> -> vector<16xf32> +// HECK: %[[T1:.*]] = vector.extract_strided_slice %[[T0]] {offsets = [0], sizes = [4], strides = [1]} : vector<16xf32> to vector<4xf32> // func @transpose16_44(%arg0: vector<16xf32>) -> vector<4x4xf32> { %0 = vector.shape_cast %arg0 : vector<16xf32> to vector<4x4xf32> @@ -49,9 +51,11 @@ func @transpose44_16(%arg0: vector<4x4xf32>) -> vector<16xf32> { // Folds preceding shape cast as expected, // but FAILS to fold following cast. // +// FIXME: PR49590 - shape_cast not stable. 
+// // CHECK-LABEL: func @transpose16_16( // CHECK-SAME: %[[A:.*]]: vector<16xf32> -// CHECK: %[[T0:.*]] = vector.flat_transpose %[[A]] {columns = 4 : i32, rows = 4 : i32} : vector<16xf32> -> vector<16xf32> +// HECK: %[[T0:.*]] = vector.flat_transpose %[[A]] {columns = 4 : i32, rows = 4 : i32} : vector<16xf32> -> vector<16xf32> // func @transpose16_16(%arg0: vector<16xf32>) -> vector<16xf32> { %0 = vector.shape_cast %arg0 : vector<16xf32> to vector<4x4xf32> diff --git a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir index 21d749cc088f..74b64ee1e263 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir @@ -25,9 +25,8 @@ func @split_vector_transfer_read_2d(%A: memref, %i: index, %j: index) - %c0 = constant 0 : index %f0 = constant 0.0 : f32 - // CHECK-DAG: %[[c0:.*]] = constant 0 : index // CHECK-DAG: %[[c8:.*]] = constant 8 : index - // CHECK-DAG: %[[cst:.*]] = constant 0.000000e+00 : f32 + // CHECK-DAG: %[[c0:.*]] = constant 0 : index // alloca for boundary full tile // CHECK: %[[alloc:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32> // %i + 4 <= dim(%A, 0) @@ -54,13 +53,12 @@ func @split_vector_transfer_read_2d(%A: memref, %i: index, %j: index) - // CHECK: scf.yield %[[yielded]], %[[c0]], %[[c0]] : // CHECK-SAME: memref, index, index // CHECK: } - // CHECK: %[[res:.*]] = vector.transfer_read %[[ifres]]#0[%[[ifres]]#1, %[[ifres]]#2], %[[cst]] + // CHECK: %[[res:.*]] = vector.transfer_read %[[ifres]]#0[%[[ifres]]#1, %[[ifres]]#2], %cst // CHECK_SAME: {masked = [false, false]} : memref, vector<4x8xf32> // LINALG-DAG: %[[c0:.*]] = constant 0 : index // LINALG-DAG: %[[c4:.*]] = constant 4 : index // LINALG-DAG: %[[c8:.*]] = constant 8 : index - // LINALG-DAG: %[[cst:.*]] = constant 0.000000e+00 : f32 // alloca for boundary full tile // LINALG: %[[alloc:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32> // %i + 4 <= dim(%A, 0) @@ -77,7 +75,7 @@ func @split_vector_transfer_read_2d(%A: memref, %i: index, %j: index) - // LINALG: scf.yield %[[A]], %[[i]], %[[j]] : memref, index, index // LINALG: } else { // slow path, fill tmp alloc and yield a memref_casted version of it - // LINALG: linalg.fill(%[[alloc]], %[[cst]]) : memref<4x8xf32>, f32 + // LINALG: linalg.fill(%[[alloc]], %cst) : memref<4x8xf32>, f32 // LINALG: %[[d0:.*]] = memref.dim %[[A]], %[[c0]] : memref // LINALG: %[[sv0:.*]] = affine.min #[[$bounds_map_4]](%[[d0]], %[[i]], %[[c4]]) // LINALG: %[[sv1:.*]] = affine.min #[[$bounds_map_8]](%[[c8]], %[[j]], %[[c8]]) @@ -89,7 +87,7 @@ func @split_vector_transfer_read_2d(%A: memref, %i: index, %j: index) - // LINALG: scf.yield %[[yielded]], %[[c0]], %[[c0]] : // LINALG-SAME: memref, index, index // LINALG: } - // LINALG: %[[res:.*]] = vector.transfer_read %[[ifres]]#0[%[[ifres]]#1, %[[ifres]]#2], %[[cst]] + // LINALG: %[[res:.*]] = vector.transfer_read %[[ifres]]#0[%[[ifres]]#1, %[[ifres]]#2], %cst // LINALG_SAME: {masked = [false, false]} : memref, vector<4x8xf32> %1 = vector.transfer_read %A[%i, %j], %f0 : memref, vector<4x8xf32> @@ -112,10 +110,9 @@ func @split_vector_transfer_read_strided_2d( %c0 = constant 0 : index %f0 = constant 0.0 : f32 - // CHECK-DAG: %[[c0:.*]] = constant 0 : index // CHECK-DAG: %[[c7:.*]] = constant 7 : index // CHECK-DAG: %[[c8:.*]] = constant 8 : index - // CHECK-DAG: %[[cst:.*]] = constant 0.000000e+00 : f32 + // CHECK-DAG: %[[c0:.*]] = constant 0 : index // 
alloca for boundary full tile // CHECK: %[[alloc:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32> // %i + 4 <= dim(%A, 0) @@ -152,7 +149,6 @@ func @split_vector_transfer_read_strided_2d( // LINALG-DAG: %[[c4:.*]] = constant 4 : index // LINALG-DAG: %[[c7:.*]] = constant 7 : index // LINALG-DAG: %[[c8:.*]] = constant 8 : index - // LINALG-DAG: %[[cst:.*]] = constant 0.000000e+00 : f32 // alloca for boundary full tile // LINALG: %[[alloc:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32> // %i + 4 <= dim(%A, 0) @@ -171,7 +167,7 @@ func @split_vector_transfer_read_strided_2d( // LINALG-SAME: memref, index, index // LINALG: } else { // slow path, fill tmp alloc and yield a memref_casted version of it - // LINALG: linalg.fill(%[[alloc]], %[[cst]]) : memref<4x8xf32>, f32 + // LINALG: linalg.fill(%[[alloc]], %cst) : memref<4x8xf32>, f32 // LINALG: %[[sv0:.*]] = affine.min #[[$bounds_map_4]](%[[c7]], %[[i]], %[[c4]]) // LINALG: %[[sv1:.*]] = affine.min #[[$bounds_map_8]](%[[c8]], %[[j]], %[[c8]]) // LINALG: %[[sv:.*]] = memref.subview %[[A]][%[[i]], %[[j]]] [%[[sv0]], %[[sv1]]] [1, 1] diff --git a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir index d5e9535acb8e..15b68275decf 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-unroll.mlir @@ -1,8 +1,8 @@ // RUN: mlir-opt %s -test-vector-transfer-unrolling-patterns | FileCheck %s // CHECK-LABEL: func @transfer_read_unroll -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK: %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> @@ -19,8 +19,8 @@ func @transfer_read_unroll(%arg0 : memref<4x4xf32>) -> vector<4x4xf32> { } // CHECK-LABEL: func @transfer_write_unroll -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK: %[[TUPL:.*]] = vector.extract_slices {{.*}}, [2, 2], [1, 1] : vector<4x4xf32> into tuple, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>> // CHECK-NEXT: %[[T0:.*]] = vector.tuple_get %[[TUPL]], 0 : tuple, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>> // CHECK-NEXT: vector.transfer_write %[[T0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, memref<4x4xf32> @@ -39,8 +39,8 @@ func @transfer_write_unroll(%arg0 : memref<4x4xf32>, %arg1 : vector<4x4xf32>) { } // CHECK-LABEL: func @transfer_readwrite_unroll -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK: %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> @@ -60,8 +60,8 @@ func @transfer_readwrite_unroll(%arg0 : memref<4x4xf32>) { } // CHECK-LABEL: func 
@transfer_read_unroll_tensor -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK: %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32> @@ -78,8 +78,8 @@ func @transfer_read_unroll_tensor(%arg0 : tensor<4x4xf32>) -> vector<4x4xf32> { } // CHECK-LABEL: func @transfer_write_unroll_tensor -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK: %[[TUPL:.*]] = vector.extract_slices {{.*}}, [2, 2], [1, 1] : vector<4x4xf32> into tuple, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>> // CHECK-NEXT: %[[T0:.*]] = vector.tuple_get %[[TUPL]], 0 : tuple, vector<2x2xf32>, vector<2x2xf32>, vector<2x2xf32>> // CHECK-NEXT: %[[VTW0:.*]] = vector.transfer_write %[[T0]], {{.*}}[%[[C0]], %[[C0]]] {{.*}} : vector<2x2xf32>, tensor<4x4xf32> @@ -100,8 +100,8 @@ func @transfer_write_unroll_tensor(%arg0 : tensor<4x4xf32>, } // CHECK-LABEL: func @transfer_readwrite_unroll_tensor -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK: %[[VTR0:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read {{.*}}[%[[C0]], %[[C2]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[VTR2:.*]] = vector.transfer_read {{.*}}[%[[C2]], %[[C0]]], %{{.*}} : tensor<4x4xf32>, vector<2x2xf32> diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir index da899389de74..9388b67dd532 100644 --- a/mlir/test/Dialect/Vector/vector-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-transforms.mlir @@ -225,8 +225,8 @@ func @contraction4x4_ikj(%arg0 : vector<4x2xf32>, %arg1 : vector<2x4xf32>, // CHECK-LABEL: func @contraction4x4_ikj_xfer_read -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // Check LHS vector.transfer read is split for each user. 
@@ -422,8 +422,8 @@ func @cancelling_shape_cast_ops(%arg0 : vector<2x4xf32>) -> vector<2x4xf32> { } // CHECK-LABEL: func @vector_transfers_vector_element_type -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C1:.*]] = constant 1 : index +// CHECK-DAG: %[[C1:.*]] = constant 1 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK: %[[VTR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {masked = [false, false]} : memref<6x2x1xvector<2x4xf32>>, vector<1x1x2x4xf32> // CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C1]], %[[C0]]], %{{.*}} {masked = [false, false]} : memref<6x2x1xvector<2x4xf32>>, vector<1x1x2x4xf32> // CHECK-NEXT: vector.transfer_write %[[VTR0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {masked = [false, false]} : vector<1x1x2x4xf32>, memref<6x2x1xvector<2x4xf32>> @@ -516,8 +516,8 @@ func @shape_cast_fold(%arg0 : vector<5x4x2xf32>, %arg1 : vector<3x4x2xf32>) // CHECK-LABEL: func @elementwise_unroll // CHECK-SAME: (%[[ARG0:.*]]: memref<4x4xf32>, %[[ARG1:.*]]: memref<4x4xf32>) -// CHECK: %[[C0:.*]] = constant 0 : index -// CHECK: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C2:.*]] = constant 2 : index +// CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK: %[[VT0:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], {{.*}} : memref<4x4xf32>, vector<2x2xf32> // CHECK: %[[VT1:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C2]]], {{.*}} : memref<4x4xf32>, vector<2x2xf32> // CHECK: %[[VT2:.*]] = vector.transfer_read %[[ARG0]][%[[C2]], %[[C0]]], {{.*}} : memref<4x4xf32>, vector<2x2xf32> diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir index 5009d0f4207d..a65c46452cc8 100644 --- a/mlir/test/Transforms/canonicalize.mlir +++ b/mlir/test/Transforms/canonicalize.mlir @@ -572,7 +572,8 @@ func @indirect_call_folding() { // // CHECK-LABEL: @lowered_affine_mod func @lowered_affine_mod() -> (index, index) { -// CHECK-NEXT: {{.*}} = constant 41 : index +// CHECK-DAG: {{.*}} = constant 1 : index +// CHECK-DAG: {{.*}} = constant 41 : index %c-43 = constant -43 : index %c42 = constant 42 : index %0 = remi_signed %c-43, %c42 : index @@ -580,7 +581,6 @@ func @lowered_affine_mod() -> (index, index) { %1 = cmpi slt, %0, %c0 : index %2 = addi %0, %c42 : index %3 = select %1, %2, %0 : index -// CHECK-NEXT: {{.*}} = constant 1 : index %c43 = constant 43 : index %c42_0 = constant 42 : index %4 = remi_signed %c43, %c42_0 : index @@ -598,7 +598,8 @@ func @lowered_affine_mod() -> (index, index) { // // CHECK-LABEL: func @lowered_affine_floordiv func @lowered_affine_floordiv() -> (index, index) { -// CHECK-NEXT: %c-2 = constant -2 : index +// CHECK-DAG: %c1 = constant 1 : index +// CHECK-DAG: %c-2 = constant -2 : index %c-43 = constant -43 : index %c42 = constant 42 : index %c0 = constant 0 : index @@ -609,7 +610,6 @@ func @lowered_affine_floordiv() -> (index, index) { %3 = divi_signed %2, %c42 : index %4 = subi %c-1, %3 : index %5 = select %0, %4, %3 : index -// CHECK-NEXT: %c1 = constant 1 : index %c43 = constant 43 : index %c42_0 = constant 42 : index %c0_1 = constant 0 : index @@ -724,17 +724,17 @@ func @view(%arg0 : index) -> (f32, f32, f32, f32) { // CHECK-LABEL: func @subview // CHECK-SAME: %[[ARG0:.*]]: index, %[[ARG1:.*]]: index func @subview(%arg0 : index, %arg1 : index) -> (index, index) { - // CHECK: %[[C0:.*]] = constant 0 : index + // Folded but reappears after subview folding into dim. 
+ // CHECK-DAG: %[[C0:.*]] = constant 0 : index + // CHECK-DAG: %[[C7:.*]] = constant 7 : index + // CHECK-DAG: %[[C11:.*]] = constant 11 : index %c0 = constant 0 : index // CHECK-NOT: constant 1 : index %c1 = constant 1 : index // CHECK-NOT: constant 2 : index %c2 = constant 2 : index // Folded but reappears after subview folding into dim. - // CHECK: %[[C7:.*]] = constant 7 : index %c7 = constant 7 : index - // Folded but reappears after subview folding into dim. - // CHECK: %[[C11:.*]] = constant 11 : index %c11 = constant 11 : index // CHECK-NOT: constant 15 : index %c15 = constant 15 : index @@ -895,8 +895,8 @@ func @index_cast_fold() -> (i16, index) { %1 = index_cast %c4 : index to i16 %c4_i16 = constant 4 : i16 %2 = index_cast %c4_i16 : i16 to index - // CHECK: %[[C4_I16:.*]] = constant 4 : i16 - // CHECK: %[[C4:.*]] = constant 4 : index + // CHECK-DAG: %[[C4:.*]] = constant 4 : index + // CHECK-DAG: %[[C4_I16:.*]] = constant 4 : i16 // CHECK: return %[[C4_I16]], %[[C4]] : i16, index return %1, %2 : i16, index } diff --git a/mlir/test/Transforms/parallel-loop-collapsing.mlir b/mlir/test/Transforms/parallel-loop-collapsing.mlir index 2bd78be6b63a..a6a9aa8f61bd 100644 --- a/mlir/test/Transforms/parallel-loop-collapsing.mlir +++ b/mlir/test/Transforms/parallel-loop-collapsing.mlir @@ -28,15 +28,15 @@ func @parallel_many_dims() { return } -// CHECK: [[C3:%.*]] = constant 3 : index -// CHECK: [[C6:%.*]] = constant 6 : index -// CHECK: [[C9:%.*]] = constant 9 : index -// CHECK: [[C10:%.*]] = constant 10 : index -// CHECK: [[C4:%.*]] = constant 4 : index -// CHECK: [[C12:%.*]] = constant 12 : index -// CHECK: [[C0:%.*]] = constant 0 : index -// CHECK: [[C1:%.*]] = constant 1 : index -// CHECK: [[C2:%.*]] = constant 2 : index +// CHECK-DAG: [[C12:%.*]] = constant 12 : index +// CHECK-DAG: [[C10:%.*]] = constant 10 : index +// CHECK-DAG: [[C9:%.*]] = constant 9 : index +// CHECK-DAG: [[C6:%.*]] = constant 6 : index +// CHECK-DAG: [[C4:%.*]] = constant 4 : index +// CHECK-DAG: [[C3:%.*]] = constant 3 : index +// CHECK-DAG: [[C2:%.*]] = constant 2 : index +// CHECK-DAG: [[C1:%.*]] = constant 1 : index +// CHECK-DAG: [[C0:%.*]] = constant 0 : index // CHECK: scf.parallel ([[NEW_I0:%.*]]) = ([[C0]]) to ([[C4]]) step ([[C1]]) { // CHECK: [[V0:%.*]] = remi_signed [[NEW_I0]], [[C2]] : index // CHECK: [[I0:%.*]] = divi_signed [[NEW_I0]], [[C2]] : index diff --git a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir index 2a516c483c89..496f73568977 100644 --- a/mlir/test/Transforms/single-parallel-loop-collapsing.mlir +++ b/mlir/test/Transforms/single-parallel-loop-collapsing.mlir @@ -14,13 +14,13 @@ func @collapse_to_single() { } // CHECK-LABEL: func @collapse_to_single() { -// CHECK: [[C7:%.*]] = constant 7 : index -// CHECK: [[C3:%.*]] = constant 3 : index -// CHECK: [[C4:%.*]] = constant 4 : index -// CHECK: [[C18:%.*]] = constant 18 : index -// CHECK: [[C6:%.*]] = constant 6 : index -// CHECK: [[C0:%.*]] = constant 0 : index -// CHECK: [[C1:%.*]] = constant 1 : index +// CHECK-DAG: [[C18:%.*]] = constant 18 : index +// CHECK-DAG: [[C6:%.*]] = constant 6 : index +// CHECK-DAG: [[C3:%.*]] = constant 3 : index +// CHECK-DAG: [[C7:%.*]] = constant 7 : index +// CHECK-DAG: [[C4:%.*]] = constant 4 : index +// CHECK-DAG: [[C1:%.*]] = constant 1 : index +// CHECK-DAG: [[C0:%.*]] = constant 0 : index // CHECK: scf.parallel ([[NEW_I:%.*]]) = ([[C0]]) to ([[C18]]) step ([[C1]]) { // CHECK: [[I0_COUNT:%.*]] = remi_signed [[NEW_I]], [[C6]] : 
index // CHECK: [[I1_COUNT:%.*]] = divi_signed [[NEW_I]], [[C6]] : index diff --git a/mlir/test/Transforms/test-canonicalize.mlir b/mlir/test/Transforms/test-canonicalize.mlir index cc6af03a7818..c0033a2409ec 100644 --- a/mlir/test/Transforms/test-canonicalize.mlir +++ b/mlir/test/Transforms/test-canonicalize.mlir @@ -52,6 +52,25 @@ func @test_commutative_multi(%arg0: i32, %arg1: i32) -> (i32, i32) { return %y, %z: i32, i32 } + +// CHECK-LABEL: func @test_commutative_multi_cst +func @test_commutative_multi_cst(%arg0: i32, %arg1: i32) -> (i32, i32) { + // CHECK-NEXT: %c42_i32 = constant 42 : i32 + %c42_i32 = constant 42 : i32 + %c42_i32_2 = constant 42 : i32 + // CHECK-NEXT: %[[O0:.*]] = "test.op_commutative"(%arg0, %arg1, %c42_i32, %c42_i32) : (i32, i32, i32, i32) -> i32 + %y = "test.op_commutative"(%c42_i32, %arg0, %arg1, %c42_i32_2) : (i32, i32, i32, i32) -> i32 + + %c42_i32_3 = constant 42 : i32 + + // CHECK-NEXT: %[[O1:.*]] = "test.op_commutative"(%arg0, %arg1, %c42_i32, %c42_i32) : (i32, i32, i32, i32) -> i32 + %z = "test.op_commutative"(%arg0, %c42_i32_3, %c42_i32_2, %arg1): (i32, i32, i32, i32) -> i32 + // CHECK-NEXT: return %[[O0]], %[[O1]] + return %y, %z: i32, i32 +} + +// CHECK-LABEL: func @typemismatch + func @typemismatch() -> i32 { %c42 = constant 42.0 : f32 -- GitLab From caddfbd2a94c7014173ce891fc0233d58b3c9db8 Mon Sep 17 00:00:00 2001 From: River Riddle Date: Fri, 19 Mar 2021 18:19:16 -0700 Subject: [PATCH 0223/1000] [mlir][docs] Remove the BuiltinDialect documentation from langref and generate it from ODS Now that all of the builtin dialect is generated from ODS, its documentation in LangRef can be split out and replaced with references to Dialects/Builtin.md. LangRef is quite crusty right now and should really have a full cleanup done in a followup. Differential Revision: https://reviews.llvm.org/D98562 --- mlir/docs/Diagnostics.md | 66 +- mlir/docs/Dialects/Builtin.md | 32 + mlir/docs/LangRef.md | 878 +-------------------------- mlir/include/mlir/IR/BuiltinTypes.td | 1 - mlir/include/mlir/IR/CMakeLists.txt | 5 +- mlir/tools/mlir-tblgen/OpDocGen.cpp | 141 +++-- 6 files changed, 157 insertions(+), 966 deletions(-) create mode 100644 mlir/docs/Dialects/Builtin.md diff --git a/mlir/docs/Diagnostics.md b/mlir/docs/Diagnostics.md index 9e1e4f9156c3..6300dd3dc06c 100644 --- a/mlir/docs/Diagnostics.md +++ b/mlir/docs/Diagnostics.md @@ -11,69 +11,9 @@ structure of the IR, operations, etc. ## Source Locations Source location information is extremely important for any compiler, because it -provides a baseline for debuggability and error-reporting. MLIR provides several -different location types depending on the situational need. - -### CallSite Location - -``` -callsite-location ::= 'callsite' '(' location 'at' location ')' -``` - -An instance of this location allows for representing a directed stack of -location usages. This connects a location of a `callee` with the location of a -`caller`. - -### FileLineCol Location - -``` -filelinecol-location ::= string-literal ':' integer-literal ':' integer-literal -``` - -An instance of this location represents a tuple of file, line number, and column -number. This is similar to the type of location that you get from most source -languages. - -### Fused Location - -``` -fused-location ::= `fused` fusion-metadata? 
'[' location (location ',')* ']'
-fusion-metadata ::= '<' attribute-value '>'
-```
-
-An instance of a `fused` location represents a grouping of several other source
-locations, with optional metadata that describes the context of the fusion.
-There are many places within a compiler in which several constructs may be fused
-together, e.g. pattern rewriting, that normally result partial or even total
-loss of location information. With `fused` locations, this is a non-issue.
-
-### Name Location
-
-```
-name-location ::= string-literal ('(' location ')')?
-```
-
-An instance of this location allows for attaching a name to a child location.
-This can be useful for representing the locations of variable, or node,
-definitions.
-
-### Opaque Location
-
-An instance of this location essentially contains a pointer to some data
-structure that is external to MLIR and an optional location that can be used if
-the first one is not suitable. Since it contains an external structure, only the
-optional location is used during serialization.
-
-### Unknown Location
-
-```
-unknown-location ::= `unknown`
-```
-
-Source location information is an extremely integral part of the MLIR
-infrastructure. As such, location information is always present in the IR, and
-must explicitly be set to unknown. Thus an instance of the `unknown` location,
-represents an unspecified source location.
+provides a baseline for debuggability and error-reporting. The
+[builtin dialect](Dialects/Builtin.md) provides several different location
+attribute types depending on the situational need.

## Diagnostic Engine
diff --git a/mlir/docs/Dialects/Builtin.md b/mlir/docs/Dialects/Builtin.md
new file mode 100644
index 000000000000..6a1bd365d2df
--- /dev/null
+++ b/mlir/docs/Dialects/Builtin.md
@@ -0,0 +1,32 @@
+# Builtin Dialect
+
+The builtin dialect contains a core set of Attributes, Operations, and Types
+that have wide applicability across a very large number of domains and
+abstractions. Many of the components of this dialect are also instrumental in
+the implementation of the core IR. As such, this dialect is implicitly loaded in
+every `MLIRContext`, and available directly to all users of MLIR.
+
+Given the far-reaching nature of this dialect and the fact that MLIR is
+extensible by design, any potential additions are heavily scrutinized.
+
+[TOC]
+
+## Attributes
+
+[include "Dialects/BuiltinAttributes.md"]
+
+## Location Attributes
+
+A subset of the builtin attribute values corresponds to
+[source locations](../Diagnostics.md#source-locations) that may be attached to
+Operations.
+
+[include "Dialects/BuiltinLocationAttributes.md"]
+
+## Operations
+
+[include "Dialects/BuiltinOps.md"]
+
+## Types
+
+[include "Dialects/BuiltinTypes.md"]
diff --git a/mlir/docs/LangRef.md b/mlir/docs/LangRef.md
index 7b58b63258a5..82cbc973e1fd 100644
--- a/mlir/docs/LangRef.md
+++ b/mlir/docs/LangRef.md
@@ -60,14 +60,13 @@ Operation](docs/Tutorials/Toy/Ch-2/#op-vs-operation-using-mlir-operations))

One obvious application of MLIR is to represent an
[SSA-based](https://en.wikipedia.org/wiki/Static_single_assignment_form) IR,
-like the LLVM core IR, with appropriate choice of Operation Types to define
-[Modules](#module), [Functions](#functions), Branches, Allocations, and
-verification constraints to ensure the SSA Dominance property. MLIR includes a
-'standard' dialect which defines just such structures.
However, MLIR is -intended to be general enough to represent other compiler-like data -structures, such as Abstract Syntax Trees in a language frontend, generated -instructions in a target-specific backend, or circuits in a High-Level -Synthesis tool. +like the LLVM core IR, with appropriate choice of operation types to define +Modules, Functions, Branches, Memory Allocation, and verification constraints to +ensure the SSA Dominance property. MLIR includes a collection of dialects which +defines just such structures. However, MLIR is intended to be general enough to +represent other compiler-like data structures, such as Abstract Syntax Trees in +a language frontend, generated instructions in a target-specific backend, or +circuits in a High-Level Synthesis tool. Here's an example of an MLIR module: @@ -328,96 +327,12 @@ In addition to the basic syntax above, dialects may register known operations. This allows those dialects to support _custom assembly form_ for parsing and printing operations. In the operation sets listed below, we show both forms. -### Terminator Operations +### Builtin Operations -These are a special category of operations that *must* terminate a block, e.g. -[branches](Dialects/Standard.md#terminator-operations). These operations may -also have a list of successors ([blocks](#blocks) and their arguments). - -Example: - -```mlir -// Branch to ^bb1 or ^bb2 depending on the condition %cond. -// Pass value %v to ^bb2, but not to ^bb1. -"cond_br"(%cond)[^bb1, ^bb2(%v : index)] : (i1) -> () -``` - -### Module - -``` -module ::= `module` symbol-ref-id? (`attributes` dictionary-attribute)? region -``` - -An MLIR Module represents a top-level container operation. It contains a single -[SSACFG region](#control-flow-and-ssacfg-regions) containing a single block -which can contain any operations. Operations within this region cannot -implicitly capture values defined outside the module, i.e. Modules are -[IsolatedFromAbove](Traits.md#isolatedfromabove). Modules have an optional -[symbol name](SymbolsAndSymbolTables.md) which can be used to refer to them in -operations. - -### Functions - -An MLIR Function is an operation with a name containing a single [SSACFG -region](#control-flow-and-ssacfg-regions). Operations within this region -cannot implicitly capture values defined outside of the function, -i.e. Functions are [IsolatedFromAbove](Traits.md#isolatedfromabove). All -external references must use function arguments or attributes that establish a -symbolic connection (e.g. symbols referenced by name via a string attribute -like [SymbolRefAttr](#symbol-reference-attribute)): - -``` -function ::= `func` function-signature function-attributes? function-body? - -function-signature ::= symbol-ref-id `(` argument-list `)` - (`->` function-result-list)? - -argument-list ::= (named-argument (`,` named-argument)*) | /*empty*/ -argument-list ::= (type dictionary-attribute? (`,` type dictionary-attribute?)*) - | /*empty*/ -named-argument ::= value-id `:` type dictionary-attribute? - -function-result-list ::= function-result-list-parens - | non-function-type -function-result-list-parens ::= `(` `)` - | `(` function-result-list-no-parens `)` -function-result-list-no-parens ::= function-result (`,` function-result)* -function-result ::= type dictionary-attribute? - -function-attributes ::= `attributes` dictionary-attribute -function-body ::= region -``` - -An external function declaration (used when referring to a function declared -in some other module) has no body. 
While the MLIR textual form provides a nice -inline syntax for function arguments, they are internally represented as -"block arguments" to the first block in the region. - -Only dialect attribute names may be specified in the attribute dictionaries -for function arguments, results, or the function itself. - -Examples: - -```mlir -// External function definitions. -func @abort() -func @scribble(i32, i64, memref) -> f64 - -// A function that returns its argument twice: -func @count(%x: i64) -> (i64, i64) - attributes {fruit: "banana"} { - return %x, %x: i64, i64 -} - -// A function with an argument attribute -func @example_fn_arg(%x: i32 {swift.self = unit}) - -// A function with a result attribute -func @example_fn_result() -> (f64 {dialectName.attrName = 0 : i64}) - -// A function with an attribute -func @example_fn_attr() attributes {dialectName.attrName = false} -``` +The [builtin dialect](Dialects/Builtin.md) defines a select few operations that +are widely applicable by MLIR dialects, such as a universal conversion cast +operation that simplifies inter/intra dialect conversion. This dialect also +defines a top-level `module` operation, that represents a useful IR container. ## Blocks @@ -701,14 +616,10 @@ defines the relation between the region results and the operation results. ## Type System -Each value in MLIR has a type defined by the type system below. There are a -number of primitive types (like integers) and also aggregate types for tensors -and memory buffers. MLIR [builtin types](#builtin-types) do not include -structures, arrays, or dictionaries. - -MLIR has an open type system (i.e. there is no fixed list of types), and types -may have application-specific semantics. For example, MLIR supports a set of -[dialect types](#dialect-types). +Each value in MLIR has a type defined by the type system. MLIR has an open type +system (i.e. there is no fixed list of types), and types may have +application-specific semantics. MLIR dialects may define any number of types +with no restrictions on the abstractions they represent. ``` type ::= type-alias | dialect-type | builtin-type @@ -806,497 +717,14 @@ the lighter syntax: `!foo.something>>` because it contains characters that are not allowed in the lighter syntax, as well as unbalanced `<>` characters. -See [here](Tutorials/DefiningAttributesAndTypes.md) to learn how to define dialect types. +See [here](Tutorials/DefiningAttributesAndTypes.md) to learn how to define +dialect types. ### Builtin Types -Builtin types are a core set of [dialect types](#dialect-types) that are defined -in a builtin dialect and thus available to all users of MLIR. - -``` -builtin-type ::= complex-type - | float-type - | function-type - | index-type - | integer-type - | memref-type - | none-type - | tensor-type - | tuple-type - | vector-type -``` - -#### Complex Type - -Syntax: - -``` -complex-type ::= `complex` `<` type `>` -``` - -The value of `complex` type represents a complex number with a parameterized -element type, which is composed of a real and imaginary value of that element -type. The element must be a floating point or integer scalar type. - -Examples: - -```mlir -complex -complex -``` - -#### Floating Point Types - -Syntax: - -``` -// Floating point. -float-type ::= `f16` | `bf16` | `f32` | `f64` | `f80` | `f128` -``` - -MLIR supports float types of certain widths that are widely used as indicated -above. - -#### Function Type - -Syntax: - -``` -// MLIR functions can return multiple values. 
-function-result-type ::= type-list-parens - | non-function-type - -function-type ::= type-list-parens `->` function-result-type -``` - -MLIR supports first-class functions: for example, the -[`constant` operation](Dialects/Standard.md#stdconstant-constantop) produces the -address of a function as a value. This value may be passed to and -returned from functions, merged across control flow boundaries with -[block arguments](#blocks), and called with the -[`call_indirect` operation](Dialects/Standard.md#call-indirect-operation). - -Function types are also used to indicate the arguments and results of -[operations](#operations). - -#### Index Type - -Syntax: - -``` -// Target word-sized integer. -index-type ::= `index` -``` - -The `index` type is a signless integer whose size is equal to the natural -machine word of the target -([rationale](Rationale/Rationale.md#integer-signedness-semantics)) and is used -by the affine constructs in MLIR. Unlike fixed-size integers, it cannot be used -as an element of vector -([rationale](Rationale/Rationale.md#index-type-disallowed-in-vector-types)). - -**Rationale:** integers of platform-specific bit widths are practical to express -sizes, dimensionalities and subscripts. - -#### Integer Type - -Syntax: - -``` -// Sized integers like i1, i4, i8, i16, i32. -signed-integer-type ::= `si` [1-9][0-9]* -unsigned-integer-type ::= `ui` [1-9][0-9]* -signless-integer-type ::= `i` [1-9][0-9]* -integer-type ::= signed-integer-type | - unsigned-integer-type | - signless-integer-type -``` - -MLIR supports arbitrary precision integer types. Integer types have a designated -width and may have signedness semantics. - -**Rationale:** low precision integers (like `i2`, `i4` etc) are useful for -low-precision inference chips, and arbitrary precision integers are useful for -hardware synthesis (where a 13 bit multiplier is a lot cheaper/smaller than a 16 -bit one). - -TODO: Need to decide on a representation for quantized integers -([initial thoughts](Rationale/Rationale.md#quantized-integer-operations)). - -#### Memref Type - -Syntax: - -``` -memref-type ::= ranked-memref-type | unranked-memref-type - -ranked-memref-type ::= `memref` `<` dimension-list-ranked type - (`,` layout-specification)? (`,` memory-space)? `>` - -unranked-memref-type ::= `memref` `<*x` type (`,` memory-space)? `>` - -stride-list ::= `[` (dimension (`,` dimension)*)? `]` -strided-layout ::= `offset:` dimension `,` `strides: ` stride-list -semi-affine-map-composition ::= (semi-affine-map `,` )* semi-affine-map -layout-specification ::= semi-affine-map-composition | strided-layout -memory-space ::= integer-literal /* | TODO: address-space-id */ -``` - -A `memref` type is a reference to a region of memory (similar to a buffer -pointer, but more powerful). The buffer pointed to by a memref can be allocated, -aliased and deallocated. A memref can be used to read and write data from/to the -memory region which it references. Memref types use the same shape specifier as -tensor types. Note that `memref`, `memref<0 x f32>`, `memref<1 x 0 x f32>`, -and `memref<0 x 1 x f32>` are all different types. - -A `memref` is allowed to have an unknown rank (e.g. `memref<*xf32>`). The -purpose of unranked memrefs is to allow external library functions to receive -memref arguments of any rank without versioning the functions based on the rank. -Other uses of this type are disallowed or will have undefined behavior. 
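The arbitrary-precision integer rationale above (a 13-bit multiplier being cheaper than a 16-bit one) is easy to experiment with through llvm::APInt, which models exactly this fixed-width wrap-around arithmetic. A minimal sketch, assuming an LLVM installation to compile against:

```cpp
#include "llvm/ADT/APInt.h"
#include <cassert>

int main() {
  // i13 semantics: values wrap modulo 2^13.
  llvm::APInt a(/*numBits=*/13, /*val=*/8191); // max unsigned i13 value
  llvm::APInt one(13, 1);
  assert((a + one).isNullValue()); // 8191 + 1 wraps to 0 in 13 bits
  // A 13-bit multiply likewise keeps only the low 13 bits.
  llvm::APInt b(13, 1000);
  assert((b * b).getZExtValue() == (1000 * 1000) % 8192);
  return 0;
}
```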
- -##### Codegen of Unranked Memref - -Using unranked memref in codegen besides the case mentioned above is highly -discouraged. Codegen is concerned with generating loop nests and specialized -instructions for high-performance, unranked memref is concerned with hiding the -rank and thus, the number of enclosing loops required to iterate over the data. -However, if there is a need to code-gen unranked memref, one possible path is to -cast into a static ranked type based on the dynamic rank. Another possible path -is to emit a single while loop conditioned on a linear index and perform -delinearization of the linear index to a dynamic array containing the (unranked) -indices. While this is possible, it is expected to not be a good idea to perform -this during codegen as the cost of the translations is expected to be -prohibitive and optimizations at this level are not expected to be worthwhile. -If expressiveness is the main concern, irrespective of performance, passing -unranked memrefs to an external C++ library and implementing rank-agnostic logic -there is expected to be significantly simpler. - -Unranked memrefs may provide expressiveness gains in the future and help bridge -the gap with unranked tensors. Unranked memrefs will not be expected to be -exposed to codegen but one may query the rank of an unranked memref (a special -op will be needed for this purpose) and perform a switch and cast to a ranked -memref as a prerequisite to codegen. - -Example: - -```mlir -// With static ranks, we need a function for each possible argument type -%A = alloc() : memref<16x32xf32> -%B = alloc() : memref<16x32x64xf32> -call @helper_2D(%A) : (memref<16x32xf32>)->() -call @helper_3D(%B) : (memref<16x32x64xf32>)->() - -// With unknown rank, the functions can be unified under one unranked type -%A = alloc() : memref<16x32xf32> -%B = alloc() : memref<16x32x64xf32> -// Remove rank info -%A_u = memref_cast %A : memref<16x32xf32> -> memref<*xf32> -%B_u = memref_cast %B : memref<16x32x64xf32> -> memref<*xf32> -// call same function with dynamic ranks -call @helper(%A_u) : (memref<*xf32>)->() -call @helper(%B_u) : (memref<*xf32>)->() -``` - -The core syntax and representation of a layout specification is a -[semi-affine map](Dialects/Affine.md#semi-affine-maps). Additionally, syntactic -sugar is supported to make certain layout specifications more intuitive to read. -For the moment, a `memref` supports parsing a strided form which is converted to -a semi-affine map automatically. - -The memory space of a memref is specified by a target-specific attribute. -It might be an integer value, string, dictionary or custom dialect attribute. -The empty memory space (attribute is None) is target specific. - -The notionally dynamic value of a memref value includes the address of the -buffer allocated, as well as the symbols referred to by the shape, layout map, -and index maps. - -Examples of memref static type - -```mlir -// Identity index/layout map -#identity = affine_map<(d0, d1) -> (d0, d1)> - -// Column major layout. -#col_major = affine_map<(d0, d1, d2) -> (d2, d1, d0)> - -// A 2-d tiled layout with tiles of size 128 x 256. -#tiled_2d_128x256 = affine_map<(d0, d1) -> (d0 div 128, d1 div 256, d0 mod 128, d1 mod 256)> - -// A tiled data layout with non-constant tile sizes. -#tiled_dynamic = affine_map<(d0, d1)[s0, s1] -> (d0 floordiv s0, d1 floordiv s1, - d0 mod s0, d1 mod s1)> - -// A layout that yields a padding on two at either end of the minor dimension. 
-#padded = affine_map<(d0, d1) -> (d0, (d1 + 2) floordiv 2, (d1 + 2) mod 2)> - - -// The dimension list "16x32" defines the following 2D index space: -// -// { (i, j) : 0 <= i < 16, 0 <= j < 32 } -// -memref<16x32xf32, #identity> - -// The dimension list "16x4x?" defines the following 3D index space: -// -// { (i, j, k) : 0 <= i < 16, 0 <= j < 4, 0 <= k < N } -// -// where N is a symbol which represents the runtime value of the size of -// the third dimension. -// -// %N here binds to the size of the third dimension. -%A = alloc(%N) : memref<16x4x?xf32, #col_major> - -// A 2-d dynamic shaped memref that also has a dynamically sized tiled layout. -// The memref index space is of size %M x %N, while %B1 and %B2 bind to the -// symbols s0, s1 respectively of the layout map #tiled_dynamic. Data tiles of -// size %B1 x %B2 in the logical space will be stored contiguously in memory. -// The allocation size will be (%M ceildiv %B1) * %B1 * (%N ceildiv %B2) * %B2 -// f32 elements. -%T = alloc(%M, %N) [%B1, %B2] : memref - -// A memref that has a two-element padding at either end. The allocation size -// will fit 16 * 64 float elements of data. -%P = alloc() : memref<16x64xf32, #padded> - -// Affine map with symbol 's0' used as offset for the first dimension. -#imapS = affine_map<(d0, d1) [s0] -> (d0 + s0, d1)> -// Allocate memref and bind the following symbols: -// '%n' is bound to the dynamic second dimension of the memref type. -// '%o' is bound to the symbol 's0' in the affine map of the memref type. -%n = ... -%o = ... -%A = alloc (%n)[%o] : <16x?xf32, #imapS> -``` - -##### Index Space - -A memref dimension list defines an index space within which the memref can be -indexed to access data. - -##### Index - -Data is accessed through a memref type using a multidimensional index into the -multidimensional index space defined by the memref's dimension list. - -Examples - -```mlir -// Allocates a memref with 2D index space: -// { (i, j) : 0 <= i < 16, 0 <= j < 32 } -%A = alloc() : memref<16x32xf32, #imapA> - -// Loads data from memref '%A' using a 2D index: (%i, %j) -%v = load %A[%i, %j] : memref<16x32xf32, #imapA> -``` - -##### Index Map - -An index map is a one-to-one -[semi-affine map](Dialects/Affine.md#semi-affine-maps) that transforms a -multidimensional index from one index space to another. For example, the -following figure shows an index map which maps a 2-dimensional index from a 2x2 -index space to a 3x3 index space, using symbols `S0` and `S1` as offsets. - -![Index Map Example](/includes/img/index-map.svg) - -The number of domain dimensions and range dimensions of an index map can be -different, but must match the number of dimensions of the input and output index -spaces on which the map operates. The index space is always non-negative and -integral. In addition, an index map must specify the size of each of its range -dimensions onto which it maps. Index map symbols must be listed in order with -symbols for dynamic dimension sizes first, followed by other required symbols. - -##### Layout Map - -A layout map is a [semi-affine map](Dialects/Affine.md#semi-affine-maps) which -encodes logical to physical index space mapping, by mapping input dimensions to -their ordering from most-major (slowest varying) to most-minor (fastest -varying). Therefore, an identity layout map corresponds to a row-major layout. -Identity layout maps do not contribute to the MemRef type identification and are -discarded on construction. 
That is, a type with an explicit identity map is -`memref(i,j)>` is strictly the same as the one without layout -maps, `memref`. - -Layout map examples: - -```mlir -// MxN matrix stored in row major layout in memory: -#layout_map_row_major = (i, j) -> (i, j) - -// MxN matrix stored in column major layout in memory: -#layout_map_col_major = (i, j) -> (j, i) - -// MxN matrix stored in a 2-d blocked/tiled layout with 64x64 tiles. -#layout_tiled = (i, j) -> (i floordiv 64, j floordiv 64, i mod 64, j mod 64) -``` - -##### Affine Map Composition - -A memref specifies a semi-affine map composition as part of its type. A -semi-affine map composition is a composition of semi-affine maps beginning with -zero or more index maps, and ending with a layout map. The composition must be -conformant: the number of dimensions of the range of one map, must match the -number of dimensions of the domain of the next map in the composition. - -The semi-affine map composition specified in the memref type, maps from accesses -used to index the memref in load/store operations to other index spaces (i.e. -logical to physical index mapping). Each of the -[semi-affine maps](Dialects/Affine.md) and thus its composition is required to -be one-to-one. - -The semi-affine map composition can be used in dependence analysis, memory -access pattern analysis, and for performance optimizations like vectorization, -copy elision and in-place updates. If an affine map composition is not specified -for the memref, the identity affine map is assumed. - -##### Strided MemRef - -A memref may specify strides as part of its type. A stride specification is a -list of integer values that are either static or `?` (dynamic case). Strides -encode the distance, in number of elements, in (linear) memory between -successive entries along a particular dimension. A stride specification is -syntactic sugar for an equivalent strided memref representation using -semi-affine maps. For example, `memref<42x16xf32, offset: 33, strides: [1, 64]>` -specifies a non-contiguous memory region of `42` by `16` `f32` elements such -that: - -1. the minimal size of the enclosing memory region must be `33 + 42 * 1 + 16 * - 64 = 1066` elements; -2. the address calculation for accessing element `(i, j)` computes `33 + i + - 64 * j` -3. the distance between two consecutive elements along the inner dimension is - `1` element and the distance between two consecutive elements along the - outer dimension is `64` elements. - -This corresponds to a column major view of the memory region and is internally -represented as the type `memref<42x16xf32, (i, j) -> (33 + i + 64 * j)>`. - -The specification of strides must not alias: given an n-D strided memref, -indices `(i1, ..., in)` and `(j1, ..., jn)` may not refer to the same memory -address unless `i1 == j1, ..., in == jn`. - -Strided memrefs represent a view abstraction over preallocated data. They are -constructed with special ops, yet to be introduced. Strided memrefs are a -special subclass of memrefs with generic semi-affine map and correspond to a -normalized memref descriptor when lowering to LLVM. - -#### None Type - -Syntax: - -``` -none-type ::= `none` -``` - -The `none` type is a unit type, i.e. a type with exactly one possible value, -where its value does not have a defined dynamic representation. 
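The strided example above, `memref<42x16xf32, offset: 33, strides: [1, 64]>`, is mechanical enough to verify on the host. A short self-checking sketch of the stated address formula `33 + i + 64 * j`:

```cpp
#include <cassert>
#include <cstdint>

// Address computation for memref<42x16xf32, offset: 33, strides: [1, 64]>
// as described above: element (i, j) lives at linear index 33 + i * 1 + j * 64.
static int64_t linearIndex(int64_t i, int64_t j) { return 33 + i * 1 + j * 64; }

int main() {
  assert(linearIndex(0, 0) == 33);                     // first element at the offset
  assert(linearIndex(1, 0) - linearIndex(0, 0) == 1);  // inner stride
  assert(linearIndex(0, 1) - linearIndex(0, 0) == 64); // outer stride
  // No aliasing: the inner dimension (42 elements, stride 1) never spills
  // into the outer stride of 64, so distinct in-bounds (i, j) pairs map to
  // distinct addresses, as the non-aliasing rule above requires.
  return 0;
}
```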
- -#### Tensor Type - -Syntax: - -``` -tensor-type ::= `tensor` `<` dimension-list type `>` - -dimension-list ::= dimension-list-ranked | (`*` `x`) -dimension-list-ranked ::= (dimension `x`)* -dimension ::= `?` | decimal-literal -``` - -Values with tensor type represents aggregate N-dimensional data values, and -have a known element type. It may have an unknown rank (indicated by `*`) or may -have a fixed rank with a list of dimensions. Each dimension may be a static -non-negative decimal constant or be dynamically determined (indicated by `?`). - -The runtime representation of the MLIR tensor type is intentionally abstracted - -you cannot control layout or get a pointer to the data. For low level buffer -access, MLIR has a [`memref` type](#memref-type). This abstracted runtime -representation holds both the tensor data values as well as information about -the (potentially dynamic) shape of the tensor. The -[`dim` operation](Dialects/Standard.md#dim-operation) returns the size of a -dimension from a value of tensor type. - -Note: hexadecimal integer literals are not allowed in tensor type declarations -to avoid confusion between `0xf32` and `0 x f32`. Zero sizes are allowed in -tensors and treated as other sizes, e.g., `tensor<0 x 1 x i32>` and `tensor<1 x -0 x i32>` are different types. Since zero sizes are not allowed in some other -types, such tensors should be optimized away before lowering tensors to vectors. - -Examples: - -```mlir -// Tensor with unknown rank. -tensor<* x f32> - -// Known rank but unknown dimensions. -tensor - -// Partially known dimensions. -tensor - -// Full static shape. -tensor<17 x 4 x 13 x 4 x f32> - -// Tensor with rank zero. Represents a scalar. -tensor - -// Zero-element dimensions are allowed. -tensor<0 x 42 x f32> - -// Zero-element tensor of f32 type (hexadecimal literals not allowed here). -tensor<0xf32> -``` - -#### Tuple Type - -Syntax: - -``` -tuple-type ::= `tuple` `<` (type ( `,` type)*)? `>` -``` - -The value of `tuple` type represents a fixed-size collection of elements, where -each element may be of a different type. - -**Rationale:** Though this type is first class in the type system, MLIR provides -no standard operations for operating on `tuple` types -([rationale](Rationale/Rationale.md#tuple-types)). - -Examples: - -```mlir -// Empty tuple. -tuple<> - -// Single element -tuple - -// Many elements. -tuple, i5> -``` - -#### Vector Type - -Syntax: - -``` -vector-type ::= `vector` `<` static-dimension-list vector-element-type `>` -vector-element-type ::= float-type | integer-type - -static-dimension-list ::= (decimal-literal `x`)+ -``` - -The vector type represents a SIMD style vector, used by target-specific -operation sets like AVX. While the most common use is for 1D vectors (e.g. -vector<16 x f32>) we also support multidimensional registers on targets that -support them (like TPUs). - -Vector shapes must be positive decimal integers. - -Note: hexadecimal integer literals are not allowed in vector type declarations, -`vector<0x42xi32>` is invalid because it is interpreted as a 2D vector with -shape `(0, 42)` and zero shapes are not allowed. +The [builtin dialect](Dialects/Builtin.md) defines a set of types that are +directly usable by any other dialect in MLIR. These types cover a range from +primitive integer and floating-point types, function types, and more. ## Attributes @@ -1401,263 +829,7 @@ attribute values. 
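The tensor and vector grammar removed above maps directly onto the builtin type API in C++. A minimal sketch that builds a few of the example types programmatically, assuming the C++ API of roughly this vintage (FloatType::getF32 and friends):

```cpp
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

int main() {
  mlir::MLIRContext ctx;
  mlir::Type f32 = mlir::FloatType::getF32(&ctx);
  // tensor<17x4x13x4xf32>, tensor<*xf32>, and vector<16xf32> from the
  // grammar above, built through the C++ API instead of parsed.
  mlir::Type ranked = mlir::RankedTensorType::get({17, 4, 13, 4}, f32);
  mlir::Type unranked = mlir::UnrankedTensorType::get(f32);
  mlir::Type vec = mlir::VectorType::get({16}, f32);
  ranked.dump();
  unranked.dump();
  vec.dump();
  return 0;
}
```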
### Builtin Attribute Values -Builtin attributes are a core set of -[dialect attribute values](#dialect-attribute-values) that are defined in a -builtin dialect and thus available to all users of MLIR. - -``` -builtin-attribute ::= affine-map-attribute - | array-attribute - | bool-attribute - | dictionary-attribute - | elements-attribute - | float-attribute - | integer-attribute - | integer-set-attribute - | string-attribute - | symbol-ref-attribute - | type-attribute - | unit-attribute -``` - -#### AffineMap Attribute - -Syntax: - -``` -affine-map-attribute ::= `affine_map` `<` affine-map `>` -``` - -An affine-map attribute is an attribute that represents an affine-map object. - -#### Array Attribute - -Syntax: - -``` -array-attribute ::= `[` (attribute-value (`,` attribute-value)*)? `]` -``` - -An array attribute is an attribute that represents a collection of attribute -values. - -#### Boolean Attribute - -Syntax: - -``` -bool-attribute ::= bool-literal -``` - -A boolean attribute is a literal attribute that represents a one-bit boolean -value, true or false. - -#### Dictionary Attribute - -Syntax: - -``` -dictionary-attribute ::= `{` (attribute-entry (`,` attribute-entry)*)? `}` -``` - -A dictionary attribute is an attribute that represents a sorted collection of -named attribute values. The elements are sorted by name, and each name must be -unique within the collection. - -#### Elements Attributes - -Syntax: - -``` -elements-attribute ::= dense-elements-attribute - | opaque-elements-attribute - | sparse-elements-attribute -``` - -An elements attribute is a literal attribute that represents a constant -[vector](#vector-type) or [tensor](#tensor-type) value. - -##### Dense Elements Attribute - -Syntax: - -``` -dense-elements-attribute ::= `dense` `<` attribute-value `>` `:` - ( tensor-type | vector-type ) -``` - -A dense elements attribute is an elements attribute where the storage for the -constant vector or tensor value has been densely packed. The attribute supports -storing integer or floating point elements, with integer/index/floating element -types. It also support storing string elements with a custom dialect string -element type. - -##### Opaque Elements Attribute - -Syntax: - -``` -opaque-elements-attribute ::= `opaque` `<` dialect-namespace `,` - hex-string-literal `>` `:` - ( tensor-type | vector-type ) -``` - -An opaque elements attribute is an elements attribute where the content of the -value is opaque. The representation of the constant stored by this elements -attribute is only understood, and thus decodable, by the dialect that created -it. - -Note: The parsed string literal must be in hexadecimal form. - -##### Sparse Elements Attribute - -Syntax: - -``` -sparse-elements-attribute ::= `sparse` `<` attribute-value `,` attribute-value - `>` `:` ( tensor-type | vector-type ) -``` - -A sparse elements attribute is an elements attribute that represents a sparse -vector or tensor object. This is where very few of the elements are non-zero. - -The attribute uses COO (coordinate list) encoding to represent the sparse -elements of the elements attribute. The indices are stored via a 2-D tensor of -64-bit integer elements with shape [N, ndims], which specifies the indices of -the elements in the sparse tensor that contains non-zero values. The element -values are stored via a 1-D tensor with shape [N], that supplies the -corresponding values for the indices. 
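The COO encoding just described can be decoded in a few lines; the following sketch materializes the document's own example (shown next) into its dense 3x4 form:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  // sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32>
  const int64_t indices[2][2] = {{0, 0}, {1, 2}}; // [N, ndims] index tensor
  const int32_t values[2] = {1, 5};               // [N] value tensor
  int32_t dense[3][4] = {};                       // zero-initialized
  for (int n = 0; n < 2; ++n)
    dense[indices[n][0]][indices[n][1]] = values[n];
  for (const auto &row : dense) {
    for (int32_t v : row)
      std::cout << v << ' ';
    std::cout << '\n';
  }
  return 0; // prints the dense tensor spelled out in the comment below
}
```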
- -Example: - -```mlir - sparse<[[0, 0], [1, 2]], [1, 5]> : tensor<3x4xi32> - -// This represents the following tensor: -/// [[1, 0, 0, 0], -/// [0, 0, 5, 0], -/// [0, 0, 0, 0]] -``` - -#### Float Attribute - -Syntax: - -``` -float-attribute ::= (float-literal (`:` float-type)?) - | (hexadecimal-literal `:` float-type) -``` - -A float attribute is a literal attribute that represents a floating point value -of the specified [float type](#floating-point-types). It can be represented in -the hexadecimal form where the hexadecimal value is interpreted as bits of the -underlying binary representation. This form is useful for representing infinity -and NaN floating point values. To avoid confusion with integer attributes, -hexadecimal literals _must_ be followed by a float type to define a float -attribute. - -Examples: - -``` -42.0 // float attribute defaults to f64 type -42.0 : f32 // float attribute of f32 type -0x7C00 : f16 // positive infinity -0x7CFF : f16 // NaN (one of possible values) -42 : f32 // Error: expected integer type -``` - -#### Integer Attribute - -Syntax: - -``` -integer-attribute ::= integer-literal ( `:` (index-type | integer-type) )? -``` - -An integer attribute is a literal attribute that represents an integral value of -the specified integer or index type. The default type for this attribute, if one -is not specified, is a 64-bit integer. - -##### Integer Set Attribute - -Syntax: - -``` -integer-set-attribute ::= `affine_set` `<` integer-set `>` -``` - -An integer-set attribute is an attribute that represents an integer-set object. - -#### String Attribute - -Syntax: - -``` -string-attribute ::= string-literal (`:` type)? -``` - -A string attribute is an attribute that represents a string literal value. - -#### Symbol Reference Attribute - -Syntax: - -``` -symbol-ref-attribute ::= symbol-ref-id (`::` symbol-ref-id)* -``` - -A symbol reference attribute is a literal attribute that represents a named -reference to an operation that is nested within an operation with the -`OpTrait::SymbolTable` trait. As such, this reference is given meaning by the -nearest parent operation containing the `OpTrait::SymbolTable` trait. It may -optionally contain a set of nested references that further resolve to a symbol -nested within a different symbol table. - -This attribute can only be held internally by -[array attributes](#array-attribute) and -[dictionary attributes](#dictionary-attribute)(including the top-level operation -attribute dictionary), i.e. no other attribute kinds such as Locations or -extended attribute kinds. - -**Rationale:** Identifying accesses to global data is critical to -enabling efficient multi-threaded compilation. Restricting global -data access to occur through symbols and limiting the places that can -legally hold a symbol reference simplifies reasoning about these data -accesses. - -See [`Symbols And SymbolTables`](SymbolsAndSymbolTables.md) for more -information. - -#### Type Attribute - -Syntax: - -``` -type-attribute ::= type -``` - -A type attribute is an attribute that represents a [type object](#type-system). - -#### Unit Attribute - -``` -unit-attribute ::= `unit` -``` - -A unit attribute is an attribute that represents a value of `unit` type. The -`unit` type allows only one value forming a singleton set. This attribute value -is used to represent attributes that only have meaning from their existence. - -One example of such an attribute could be the `swift.self` attribute. 
This -attribute indicates that a function parameter is the self/context parameter. It -could be represented as a [boolean attribute](#boolean-attribute)(true or -false), but a value of false doesn't really bring any value. The parameter -either is the self/context or it isn't. - -```mlir -// A unit attribute defined with the `unit` value specifier. -func @verbose_form(i1) attributes {dialectName.unitAttr = unit} - -// A unit attribute can also be defined without the value specifier. -func @simple_form(i1) attributes {dialectName.unitAttr} -``` +The [builtin dialect](Dialects/Builtin.md) defines a set of attribute values +that are directly usable by any other dialect in MLIR. These types cover a range +from primitive integer and floating-point values, attribute dictionaries, dense +multi-dimensional arrays, and more. diff --git a/mlir/include/mlir/IR/BuiltinTypes.td b/mlir/include/mlir/IR/BuiltinTypes.td index 02f699ab3628..22d194db3b68 100644 --- a/mlir/include/mlir/IR/BuiltinTypes.td +++ b/mlir/include/mlir/IR/BuiltinTypes.td @@ -131,7 +131,6 @@ def Builtin_Function : Builtin_Type<"Function"> { The function type can be thought of as a function signature. It consists of a list of formal parameter types and a list of formal result types. - ``` }]; let parameters = (ins "ArrayRef":$inputs, "ArrayRef":$results); let builders = [ diff --git a/mlir/include/mlir/IR/CMakeLists.txt b/mlir/include/mlir/IR/CMakeLists.txt index e44e5dc218b1..963d6a87eee2 100644 --- a/mlir/include/mlir/IR/CMakeLists.txt +++ b/mlir/include/mlir/IR/CMakeLists.txt @@ -26,4 +26,7 @@ mlir_tablegen(BuiltinTypes.h.inc -gen-typedef-decls) mlir_tablegen(BuiltinTypes.cpp.inc -gen-typedef-defs) add_public_tablegen_target(MLIRBuiltinTypesIncGen) -add_mlir_doc(BuiltinOps -gen-dialect-doc Builtin Dialects/) +add_mlir_doc(BuiltinAttributes -gen-attrdef-doc BuiltinAttributes Dialects/) +add_mlir_doc(BuiltinLocationAttributes -gen-attrdef-doc BuiltinLocationAttributes Dialects/) +add_mlir_doc(BuiltinOps -gen-op-doc BuiltinOps Dialects/) +add_mlir_doc(BuiltinTypes -gen-typedef-doc BuiltinTypes Dialects/) diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index 45325deb2b79..d869aed8cb7d 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -162,46 +162,51 @@ static void emitTypeDoc(const Type &type, raw_ostream &os) { // TypeDef Documentation //===----------------------------------------------------------------------===// -/// Emit the assembly format of a type. -static void emitTypeAssemblyFormat(TypeDef td, raw_ostream &os) { +static void emitAttrOrTypeDefAssemblyFormat(const AttrOrTypeDef &def, + raw_ostream &os) { SmallVector parameters; - td.getParameters(parameters); - if (parameters.size() == 0) { - os << "\nSyntax: `!" << td.getDialect().getName() << "." << td.getMnemonic() - << "`\n"; + def.getParameters(parameters); + if (parameters.empty()) { + os << "\nSyntax: `!" << def.getDialect().getName() << "." + << def.getMnemonic() << "`\n"; return; } - os << "\nSyntax:\n\n```\n!" << td.getDialect().getName() << "." - << td.getMnemonic() << "<\n"; - for (auto *it = parameters.begin(), *e = parameters.end(); it < e; ++it) { - os << " " << it->getSyntax(); - if (it < parameters.end() - 1) + os << "\nSyntax:\n\n```\n!" << def.getDialect().getName() << "." 
+ << def.getMnemonic() << "<\n"; + for (auto it : llvm::enumerate(parameters)) { + const AttrOrTypeParameter ¶m = it.value(); + os << " " << param.getSyntax(); + if (it.index() < (parameters.size() - 1)) os << ","; - os << " # " << it->getName() << "\n"; + os << " # " << param.getName() << "\n"; } os << ">\n```\n"; } -static void emitTypeDefDoc(TypeDef td, raw_ostream &os) { - os << llvm::formatv("### `{0}` ({1})\n", td.getName(), td.getCppClassName()); +static void emitAttrOrTypeDefDoc(const AttrOrTypeDef &def, raw_ostream &os) { + os << llvm::formatv("### {0}\n", def.getCppClassName()); - // Emit the summary, syntax, and description if present. - if (td.hasSummary()) - os << "\n" << td.getSummary() << "\n"; - if (td.getMnemonic() && td.getPrinterCode() && *td.getPrinterCode() == "" && - td.getParserCode() && *td.getParserCode() == "") - emitTypeAssemblyFormat(td, os); - if (td.hasDescription()) { + // Emit the summary if present. + if (def.hasSummary()) + os << "\n" << def.getSummary() << "\n"; + + // Emit the syntax if present. + if (def.getMnemonic() && def.getPrinterCode() == StringRef() && + def.getParserCode() == StringRef()) + emitAttrOrTypeDefAssemblyFormat(def, os); + + // Emit the description if present. + if (def.hasDescription()) { os << "\n"; - mlir::tblgen::emitDescription(td.getDescription(), os); + mlir::tblgen::emitDescription(def.getDescription(), os); } - // Emit attribute documentation. + // Emit parameter documentation. SmallVector parameters; - td.getParameters(parameters); + def.getParameters(parameters); if (!parameters.empty()) { - os << "\n#### Type parameters:\n\n"; + os << "\n#### Parameters:\n\n"; os << "| Parameter | C++ type | Description |\n" << "| :-------: | :-------: | ----------- |\n"; for (const auto &it : parameters) { @@ -214,24 +219,35 @@ static void emitTypeDefDoc(TypeDef td, raw_ostream &os) { os << "\n"; } +static void emitAttrOrTypeDefDoc(const RecordKeeper &recordKeeper, + raw_ostream &os, StringRef recordTypeName) { + std::vector defs = + recordKeeper.getAllDerivedDefinitions(recordTypeName); + + os << "\n"; + for (const llvm::Record *def : defs) + emitAttrOrTypeDefDoc(AttrOrTypeDef(def), os); +} + //===----------------------------------------------------------------------===// // Dialect Documentation //===----------------------------------------------------------------------===// -static void emitDialectDoc(const Dialect &dialect, ArrayRef ops, - ArrayRef types, ArrayRef typeDefs, - raw_ostream &os) { - os << "# "; - if (dialect.getName().empty()) - os << "Builtin"; - else - os << "'" << dialect.getName() << "'"; - os << " Dialect\n\n"; +static void emitDialectDoc(const Dialect &dialect, ArrayRef attrDefs, + ArrayRef ops, ArrayRef types, + ArrayRef typeDefs, raw_ostream &os) { + os << "# '" << dialect.getName() << "' Dialect\n\n"; emitIfNotEmpty(dialect.getSummary(), os); emitIfNotEmpty(dialect.getDescription(), os); os << "[TOC]\n\n"; + if (!attrDefs.empty()) { + os << "## Attribute definition\n\n"; + for (const AttrDef &def : attrDefs) + emitAttrOrTypeDefDoc(def, os); + } + // TODO: Add link between use and def for types if (!types.empty()) { os << "## Type constraint definition\n\n"; @@ -247,46 +263,68 @@ static void emitDialectDoc(const Dialect &dialect, ArrayRef ops, if (!typeDefs.empty()) { os << "## Type definition\n\n"; - for (const TypeDef &td : typeDefs) - emitTypeDefDoc(td, os); + for (const TypeDef &def : typeDefs) + emitAttrOrTypeDefDoc(def, os); } } static void emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) { 
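// (Illustration, not part of the patch: llvm::enumerate from
// "llvm/ADT/STLExtras.h", as used in emitAttrOrTypeDefAssemblyFormat above,
// yields (index, value) pairs and replaces the old pointer arithmetic:
//
//   for (auto it : llvm::enumerate(parameters)) {
//     bool isLast = it.index() + 1 == parameters.size();
//     process(it.value(), isLast); // process() is a placeholder
//   }
// )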
- const auto &opDefs = recordKeeper.getAllDerivedDefinitions("Op"); - const auto &typeDefs = recordKeeper.getAllDerivedDefinitions("DialectType"); - const auto &typeDefDefs = recordKeeper.getAllDerivedDefinitions("TypeDef"); + std::vector opDefs = recordKeeper.getAllDerivedDefinitions("Op"); + std::vector typeDefs = + recordKeeper.getAllDerivedDefinitions("DialectType"); + std::vector typeDefDefs = + recordKeeper.getAllDerivedDefinitions("TypeDef"); + std::vector attrDefDefs = + recordKeeper.getAllDerivedDefinitions("AttrDef"); std::set dialectsWithDocs; - std::map> dialectOps; - std::map> dialectTypes; - std::map> dialectTypeDefs; + + llvm::StringMap> dialectAttrDefs; + llvm::StringMap> dialectOps; + llvm::StringMap> dialectTypes; + llvm::StringMap> dialectTypeDefs; + for (auto *attrDef : attrDefDefs) { + AttrDef attr(attrDef); + dialectAttrDefs[attr.getDialect().getName()].push_back(attr); + dialectsWithDocs.insert(attr.getDialect()); + } for (auto *opDef : opDefs) { Operator op(opDef); - dialectOps[op.getDialect()].push_back(op); + dialectOps[op.getDialect().getName()].push_back(op); dialectsWithDocs.insert(op.getDialect()); } for (auto *typeDef : typeDefs) { Type type(typeDef); if (auto dialect = type.getDialect()) - dialectTypes[dialect].push_back(type); + dialectTypes[dialect.getName()].push_back(type); } for (auto *typeDef : typeDefDefs) { TypeDef type(typeDef); - dialectTypeDefs[type.getDialect()].push_back(type); + dialectTypeDefs[type.getDialect().getName()].push_back(type); dialectsWithDocs.insert(type.getDialect()); } os << "\n"; - for (auto dialect : dialectsWithDocs) - emitDialectDoc(dialect, dialectOps[dialect], dialectTypes[dialect], - dialectTypeDefs[dialect], os); + for (const Dialect &dialect : dialectsWithDocs) { + StringRef dialectName = dialect.getName(); + emitDialectDoc(dialect, dialectAttrDefs[dialectName], + dialectOps[dialectName], dialectTypes[dialectName], + dialectTypeDefs[dialectName], os); + } } //===----------------------------------------------------------------------===// // Gen Registration //===----------------------------------------------------------------------===// +static mlir::GenRegistration + genAttrRegister("gen-attrdef-doc", + "Generate dialect attribute documentation", + [](const RecordKeeper &records, raw_ostream &os) { + emitAttrOrTypeDefDoc(records, os, "AttrDef"); + return false; + }); + static mlir::GenRegistration genOpRegister("gen-op-doc", "Generate dialect documentation", [](const RecordKeeper &records, raw_ostream &os) { @@ -294,6 +332,13 @@ static mlir::GenRegistration return false; }); +static mlir::GenRegistration + genTypeRegister("gen-typedef-doc", "Generate dialect type documentation", + [](const RecordKeeper &records, raw_ostream &os) { + emitAttrOrTypeDefDoc(records, os, "TypeDef"); + return false; + }); + static mlir::GenRegistration genRegister("gen-dialect-doc", "Generate dialect documentation", [](const RecordKeeper &records, raw_ostream &os) { -- GitLab From 697f90ebfa7c48d61e7ed0d627b91f369b5014bd Mon Sep 17 00:00:00 2001 From: Anshil Gandhi Date: Fri, 19 Mar 2021 19:59:45 -0500 Subject: [PATCH 0224/1000] [NFC] [PowerPC] Determine Endianness in PPCTargetMachine The TargetMachine uses the triple to determine endianness. Just use that logic rather than replicating it in PPCSubtarget. 
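For reference, a stand-alone sketch of the triple check being centralized; isLittleEndianPPC is an illustrative name, the in-tree helper added below is isLittleEndianTriple():

```cpp
#include "llvm/ADT/Triple.h"
#include <cassert>

static bool isLittleEndianPPC(const llvm::Triple &T) {
  return T.getArch() == llvm::Triple::ppc64le ||
         T.getArch() == llvm::Triple::ppcle;
}

int main() {
  assert(isLittleEndianPPC(llvm::Triple("powerpc64le-unknown-linux-gnu")));
  assert(!isLittleEndianPPC(llvm::Triple("powerpc64-unknown-linux-gnu")));
  assert(!isLittleEndianPPC(llvm::Triple("powerpc-unknown-linux-gnu")));
  return 0;
}
```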
Differential revision: https://reviews.llvm.org/D98674 --- llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 4 +--- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 15 +++++++++++++-- llvm/lib/Target/PowerPC/PPCTargetMachine.h | 5 +++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index bf98ea8a01d0..51c80e14398c 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -182,9 +182,7 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { StackAlignment = getPlatformStackAlignment(); // Determine endianness. - // FIXME: Part of the TargetMachine. - IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le || - TargetTriple.getArch() == Triple::ppcle); + IsLittleEndian = TM.isLittleEndian(); } bool PPCSubtarget::enableMachineScheduler() const { return true; } diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 47fe65640417..32b19d5ddd10 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -126,13 +126,17 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTarget() { initializeGlobalISel(PR); } +static bool isLittleEndianTriple(const Triple &T) { + return T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle; +} + /// Return the datalayout string of a subtarget. static std::string getDataLayoutString(const Triple &T) { bool is64Bit = T.getArch() == Triple::ppc64 || T.getArch() == Triple::ppc64le; std::string Ret; // Most PPC* platforms are big endian, PPC(64)LE is little endian. - if (T.getArch() == Triple::ppc64le || T.getArch() == Triple::ppcle) + if (isLittleEndianTriple(T)) Ret = "e"; else Ret = "E"; @@ -317,7 +321,8 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT, getEffectiveRelocModel(TT, RM), getEffectivePPCCodeModel(TT, CM, JIT), OL), TLOF(createTLOF(getTargetTriple())), - TargetABI(computeTargetABI(TT, Options)) { + TargetABI(computeTargetABI(TT, Options)), + Endianness(isLittleEndianTriple(TT) ? Endian::LITTLE : Endian::BIG) { initAsmInfo(); } @@ -540,6 +545,12 @@ PPCTargetMachine::getTargetTransformInfo(const Function &F) { return TargetTransformInfo(PPCTTIImpl(this, F)); } +bool PPCTargetMachine::isLittleEndian() const { + assert(Endianness != Endian::NOT_DETECTED && + "Unable to determine endianness"); + return Endianness == Endian::LITTLE; +} + static MachineSchedRegistry PPCPreRASchedRegistry("ppc-prera", "Run PowerPC PreRA specific scheduler", diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.h b/llvm/lib/Target/PowerPC/PPCTargetMachine.h index 21faa4e710e3..ed9e74b72d1e 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.h +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.h @@ -25,9 +25,12 @@ namespace llvm { class PPCTargetMachine final : public LLVMTargetMachine { public: enum PPCABI { PPC_ABI_UNKNOWN, PPC_ABI_ELFv1, PPC_ABI_ELFv2 }; + enum Endian { NOT_DETECTED, LITTLE, BIG }; + private: std::unique_ptr TLOF; PPCABI TargetABI; + Endian Endianness = Endian::NOT_DETECTED; mutable StringMap> SubtargetMap; @@ -63,6 +66,8 @@ public: // Addrspacecasts are always noops. 
    return true;
  }
+
+  bool isLittleEndian() const;
};

} // end namespace llvm
-- 
GitLab


From b76c09023d9a341353d7bcae1782154d80121838 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Sat, 20 Mar 2021 10:03:26 +0900
Subject: [PATCH 0225/1000] [AMDGPU] Allow index optimisation in SIPreEmitPeephole for bundles

Add code so duplicate index register changes can be removed from inside
bundles.

Reviewed By: rampitec, foad

Differential Revision: https://reviews.llvm.org/D98940
---
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp  |  16 ++-
 .../CodeGen/AMDGPU/set-gpr-idx-peephole.mir   | 110 ++++++++++++++++++
 2 files changed, 121 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 9ca43512cd91..5f10fefa469f 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -219,8 +219,11 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
    return false;

  // Scan back to find an identical S_SET_GPR_IDX_ON
-  for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
-                                   E = MI.getIterator(); I != E; ++I) {
+  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
+                                         E = MI.getIterator();
+       I != E; ++I) {
+    if (I->isBundle())
+      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
@@ -249,9 +252,9 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
    }
  }

-  MI.eraseFromParent();
+  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
-    RI->eraseFromParent();
+    RI->eraseFromBundle();

  return true;
}
@@ -315,7 +318,10 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do expensive checks in the optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
-    for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
+    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
+    // may be bundled with the instructions they modify.
+    for (MachineBasicBlock::instr_iterator MBBI = MBB.instr_begin();
+         MBBI != MBBE;) {
      MachineInstr &MI = *MBBI;
      ++MBBI;

diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
index c423f757c38d..95f62e5e7cd3 100644
--- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
+++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir
@@ -356,3 +356,113 @@ body: |
    V_MOV_B32_indirect undef $vgpr0, undef $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3(tied-def 3)
    S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode
...
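The iterator to instr_iterator switch above matters because plain block iterators step over a BUNDLE as a single unit, hiding the bundled instructions. A toy model of the instr_iterator-style walk, with strings standing in for MachineInstrs:

```cpp
#include <iostream>
#include <string>
#include <vector>

struct Inst {
  std::string opcode;
  bool bundleHeader = false; // analogue of MachineInstr::isBundle()
};

int main() {
  // A BUNDLE header followed by the instructions bundled under it.
  std::vector<Inst> block = {{"BUNDLE", true},
                             {"S_SET_GPR_IDX_ON"},
                             {"V_MOV_B32_e32"},
                             {"S_SET_GPR_IDX_OFF"}};
  // instr_iterator-style walk: visit everything, but skip the header
  // itself, much as the rewritten scan loop does with isBundle().
  for (const Inst &I : block) {
    if (I.bundleHeader)
      continue;
    std::cout << I.opcode << '\n';
  }
  return 0;
}
```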
+ +--- +name: simple_bundle +body: | + bb.0: + ; GCN-LABEL: name: simple_bundle + ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN: $vgpr16 = V_MOV_B32_e32 undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + ; GCN: } + ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN: $vgpr15 = V_MOV_B32_e32 undef $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN: } + BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + $vgpr16 = V_MOV_B32_e32 undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + } + BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + $vgpr15 = V_MOV_B32_e32 undef $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + } +... 
+ +--- +name: salu_in_between_bundle +body: | + bb.0: + ; GCN-LABEL: name: salu_in_between_bundle + ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN: $vgpr16 = V_MOV_B32_e32 undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + ; GCN: } + ; GCN: $sgpr0 = S_MOV_B32 $sgpr2 + ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN: $vgpr15 = V_MOV_B32_e32 undef $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN: } + BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + $vgpr16 = V_MOV_B32_e32 undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + } + $sgpr0 = S_MOV_B32 $sgpr2 + BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + $vgpr15 = V_MOV_B32_e32 undef $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + } +... 
+ +--- +name: valu_in_between_bundle +body: | + bb.0: + ; GCN-LABEL: name: valu_in_between_bundle + ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN: $vgpr16 = V_MOV_B32_e32 undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN: } + ; GCN: $vgpr20 = V_MOV_B32_e32 1, implicit $exec + ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN: $vgpr15 = V_MOV_B32_e32 undef $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN: } + BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + $vgpr16 = V_MOV_B32_e32 undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + } + $vgpr20 = V_MOV_B32_e32 1, implicit $exec + BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + $vgpr15 = V_MOV_B32_e32 undef $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + } +... 
+ +--- +name: changed_index_bundle +body: | + bb.0: + ; GCN-LABEL: name: changed_index_bundle + ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN: $vgpr16 = V_MOV_B32_e32 undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN: } + ; GCN: $sgpr2 = S_MOV_B32 1 + ; GCN: BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + ; GCN: S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + ; GCN: $vgpr15 = V_MOV_B32_e32 undef $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + ; GCN: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + ; GCN: } + BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr16, implicit-def $vgpr16_lo16, implicit-def $vgpr16_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + $vgpr16 = V_MOV_B32_e32 undef $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + } + $sgpr2 = S_MOV_B32 1 + BUNDLE implicit-def $m0, implicit-def $m0_lo16, implicit-def $m0_hi16, implicit-def $mode, implicit-def $vgpr15, implicit-def $vgpr15_lo16, implicit-def $vgpr15_hi16, implicit $sgpr2, implicit $m0, implicit $mode, implicit undef $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 { + S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $mode, implicit-def $m0, implicit $mode, implicit undef $m0 + $vgpr15 = V_MOV_B32_e32 undef $vgpr0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $m0 + S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode + } +... -- GitLab From d9343e61534f54665b2be6dd8bc2e051220d3beb Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Fri, 19 Mar 2021 15:43:42 -0700 Subject: [PATCH 0226/1000] [mlir][python] Function decorator for capturing a FuncOp from a python function. * Moves this out of a test case where it was being developed to good effect and generalizes it. * Having tried a number of things like this, I think this balances concerns reasonably well. 
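For illustration, a minimal usage sketch of the decorator added below, assuming
only the in-tree Python bindings are on the path (the builtin.py test in this
patch exercises the same pattern):

  from mlir.ir import Context, InsertionPoint, Location, F32Type
  from mlir.dialects import builtin

  with Context(), Location.unknown():
    module = builtin.ModuleOp()
    f32 = F32Type.get()
    with InsertionPoint.at_block_terminator(module.body):
      # An entry block with one f32 argument is created; the argument is
      # passed positionally, the ReturnOp is inserted implicitly from the
      # returned Value, and the function type is recomputed to (f32) -> f32.
      @builtin.FuncOp.from_py_func(f32)
      def identity(a):
        return a
  print(module)

Calling `identity(...)` afterwards emits a std.CallOp at the then-current
insertion point rather than re-entering the Python body.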
Differential Revision: https://reviews.llvm.org/D98989
---
 .../Python/mlir/dialects/_builtin_ops_ext.py  | 101 ++++++++++++++++++
 mlir/test/Bindings/Python/dialects/builtin.py | 100 ++++++++++++++++-
 .../linalg/opdsl/emit_structured_generic.py   |  80 ++++----------
 3 files changed, 218 insertions(+), 63 deletions(-)

diff --git a/mlir/lib/Bindings/Python/mlir/dialects/_builtin_ops_ext.py b/mlir/lib/Bindings/Python/mlir/dialects/_builtin_ops_ext.py
index b0789299139d..dc1d37e766d0 100644
--- a/mlir/lib/Bindings/Python/mlir/dialects/_builtin_ops_ext.py
+++ b/mlir/lib/Bindings/Python/mlir/dialects/_builtin_ops_ext.py
@@ -1,6 +1,11 @@
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from typing import Optional, Sequence
+
+import inspect
+
 from ..ir import *
 
@@ -93,3 +98,99 @@ class FuncOp:
       raise IndexError('The function already has an entry block!')
     self.body.blocks.append(*self.type.inputs)
     return self.body.blocks[0]
+
+  @classmethod
+  def from_py_func(FuncOp,
+                   *inputs: Type,
+                   results: Optional[Sequence[Type]] = None,
+                   name: Optional[str] = None):
+    """Decorator to define an MLIR FuncOp specified as a Python function.
+
+    Requires that an `mlir.ir.InsertionPoint` and `mlir.ir.Location` are
+    active for the current thread (i.e. established in a `with` block).
+
+    When applied as a decorator to a Python function, an entry block will
+    be constructed for the FuncOp with types as specified in `*inputs`. The
+    block arguments will be passed positionally to the Python function. In
+    addition, if the Python function accepts keyword arguments generally or
+    declares a corresponding keyword argument, the following will be passed:
+      * `func_op`: The `func` op being defined.
+
+    By default, the function name will be the Python function `__name__`. This
+    can be overridden by passing the `name` argument to the decorator.
+
+    If `results` is not specified, then the decorator will implicitly
+    insert a `ReturnOp` with the `Value`s returned from the decorated
+    function. It will also set the `FuncOp` type with the actual return
+    value types. If `results` is specified, then the decorated function
+    must return `None` and no implicit `ReturnOp` is added (nor are the result
+    types updated). The implicit behavior is intended for simple, single-block
+    cases, and users should specify result types explicitly for any complicated
+    cases.
+
+    The decorated function can further be called from Python and will insert
+    a `CallOp` at the then-current insertion point, returning either None
+    (if no return values), a unary Value (for one result), or a list of Values
+    (for multiple results). This mechanism cannot be used to emit recursive
+    calls (by construction).
+    """
+
+    def decorator(f):
+      from . import std
+      # Introspect the callable for optional features.
+      sig = inspect.signature(f)
+      has_arg_func_op = False
+      for param in sig.parameters.values():
+        if param.kind == param.VAR_KEYWORD:
+          has_arg_func_op = True
+        if param.name == "func_op" and (param.kind
+                                        == param.POSITIONAL_OR_KEYWORD or
+                                        param.kind == param.KEYWORD_ONLY):
+          has_arg_func_op = True
+
+      # Emit the FuncOp.
+ implicit_return = results is None + symbol_name = name or f.__name__ + function_type = FunctionType.get( + inputs=inputs, results=[] if implicit_return else results) + func_op = FuncOp(name=symbol_name, type=function_type) + with InsertionPoint(func_op.add_entry_block()): + func_args = func_op.entry_block.arguments + func_kwargs = {} + if has_arg_func_op: + func_kwargs["func_op"] = func_op + return_values = f(*func_args, **func_kwargs) + if not implicit_return: + return_types = list(results) + assert return_values is None, ( + "Capturing a python function with explicit `results=` " + "requires that the wrapped function returns None.") + else: + # Coerce return values, add ReturnOp and rewrite func type. + if return_values is None: + return_values = [] + elif isinstance(return_values, Value): + return_values = [return_values] + else: + return_values = list(return_values) + std.ReturnOp(return_values) + # Recompute the function type. + return_types = [v.type for v in return_values] + function_type = FunctionType.get(inputs=inputs, results=return_types) + func_op.attributes["type"] = TypeAttr.get(function_type) + + def emit_call_op(*call_args): + call_op = std.CallOp(return_types, FlatSymbolRefAttr.get(symbol_name), + call_args) + if return_types is None: + return None + elif len(return_types) == 1: + return call_op.result + else: + return call_op.results + + wrapped = emit_call_op + wrapped.__name__ = f.__name__ + wrapped.func_op = func_op + return wrapped + + return decorator diff --git a/mlir/test/Bindings/Python/dialects/builtin.py b/mlir/test/Bindings/Python/dialects/builtin.py index 447a255f6021..80dea68bae36 100644 --- a/mlir/test/Bindings/Python/dialects/builtin.py +++ b/mlir/test/Bindings/Python/dialects/builtin.py @@ -8,9 +8,106 @@ import mlir.dialects.std as std def run(f): print("\nTEST:", f.__name__) f() + return f + + +# CHECK-LABEL: TEST: testFromPyFunc +@run +def testFromPyFunc(): + with Context() as ctx, Location.unknown() as loc: + m = builtin.ModuleOp() + f32 = F32Type.get() + f64 = F64Type.get() + with InsertionPoint.at_block_terminator(m.body): + # CHECK-LABEL: func @unary_return(%arg0: f64) -> f64 + # CHECK: return %arg0 : f64 + @builtin.FuncOp.from_py_func(f64) + def unary_return(a): + return a + + # CHECK-LABEL: func @binary_return(%arg0: f32, %arg1: f64) -> (f32, f64) + # CHECK: return %arg0, %arg1 : f32, f64 + @builtin.FuncOp.from_py_func(f32, f64) + def binary_return(a, b): + return a, b + + # CHECK-LABEL: func @none_return(%arg0: f32, %arg1: f64) + # CHECK: return + @builtin.FuncOp.from_py_func(f32, f64) + def none_return(a, b): + pass + + # CHECK-LABEL: func @call_unary + # CHECK: %0 = call @unary_return(%arg0) : (f64) -> f64 + # CHECK: return %0 : f64 + @builtin.FuncOp.from_py_func(f64) + def call_unary(a): + return unary_return(a) + + # CHECK-LABEL: func @call_binary + # CHECK: %0:2 = call @binary_return(%arg0, %arg1) : (f32, f64) -> (f32, f64) + # CHECK: return %0#0, %0#1 : f32, f64 + @builtin.FuncOp.from_py_func(f32, f64) + def call_binary(a, b): + return binary_return(a, b) + + # CHECK-LABEL: func @call_none + # CHECK: call @none_return(%arg0, %arg1) : (f32, f64) -> () + # CHECK: return + @builtin.FuncOp.from_py_func(f32, f64) + def call_none(a, b): + return none_return(a, b) + + ## Variants and optional feature tests. 
+ # CHECK-LABEL: func @from_name_arg + @builtin.FuncOp.from_py_func(f32, f64, name="from_name_arg") + def explicit_name(a, b): + return b + + @builtin.FuncOp.from_py_func(f32, f64) + def positional_func_op(a, b, func_op): + assert isinstance(func_op, builtin.FuncOp) + return b + + @builtin.FuncOp.from_py_func(f32, f64) + def kw_func_op(a, b=None, func_op=None): + assert isinstance(func_op, builtin.FuncOp) + return b + + @builtin.FuncOp.from_py_func(f32, f64) + def kwargs_func_op(a, b=None, **kwargs): + assert isinstance(kwargs["func_op"], builtin.FuncOp) + return b + + # CHECK-LABEL: func @explicit_results(%arg0: f32, %arg1: f64) -> f64 + # CHECK: return %arg1 : f64 + @builtin.FuncOp.from_py_func(f32, f64, results=[f64]) + def explicit_results(a, b): + std.ReturnOp([b]) + + print(m) + + +# CHECK-LABEL: TEST: testFromPyFuncErrors +@run +def testFromPyFuncErrors(): + with Context() as ctx, Location.unknown() as loc: + m = builtin.ModuleOp() + f32 = F32Type.get() + f64 = F64Type.get() + with InsertionPoint.at_block_terminator(m.body): + try: + + @builtin.FuncOp.from_py_func(f64, results=[f64]) + def unary_return(a): + return a + except AssertionError as e: + # CHECK: Capturing a python function with explicit `results=` requires that the wrapped function returns None. + print(e) # CHECK-LABEL: TEST: testBuildFuncOp +@run def testBuildFuncOp(): ctx = Context() with Location.unknown(ctx) as loc: @@ -64,6 +161,3 @@ def testBuildFuncOp(): # CHECK: return %arg0 : tensor<2x3x4xf32> # CHECK: } print(m) - - -run(testBuildFuncOp) diff --git a/mlir/test/Bindings/Python/dialects/linalg/opdsl/emit_structured_generic.py b/mlir/test/Bindings/Python/dialects/linalg/opdsl/emit_structured_generic.py index 7f8c11679457..573999c97525 100644 --- a/mlir/test/Bindings/Python/dialects/linalg/opdsl/emit_structured_generic.py +++ b/mlir/test/Bindings/Python/dialects/linalg/opdsl/emit_structured_generic.py @@ -10,46 +10,6 @@ from mlir.dialects import std from mlir.dialects.linalg.opdsl.lang import * -# TODO: Find a home for this quality of life helper. -def build_function(*inputs: Type, results: Optional[Sequence[Type]] = None): - """Decorator that emits a function in a more pythonic way. - - If result types are not specified, they are inferred from the function - returns. The `ReturnOp` is implicitly added upon the wrapped function return. - """ - - def decorator(f): - return_types = results - symbol_name = f.__name__ - function_type = FunctionType.get(inputs=inputs, results=results or []) - func_op = builtin.FuncOp(name=symbol_name, type=function_type) - with InsertionPoint(func_op.add_entry_block()): - func_args = func_op.entry_block.arguments - return_values = f(*func_args) - if return_values is None: - return_values = [] - elif isinstance(return_values, Value): - return_values = [return_values] - else: - return_values = list(return_values) - std.ReturnOp(return_values) - if return_types is None: - # Recompute the function type. - return_types = [v.type for v in return_values] - function_type = FunctionType.get(inputs=inputs, results=return_types) - # TODO: Have an API or a setter for this. - func_op.attributes["type"] = TypeAttr.get(function_type) - - # TODO: When turning this into a real facility, return a function that emits - # a `call` to the function instead of doing nothing. 
- wrapped = lambda: None - wrapped.__name__ = symbol_name - wrapped.func_op = func_op - return wrapped - - return decorator - - @linalg_structured_op def matmul_mono(A=TensorDef(T, S.M, S.K), B=TensorDef(T, S.K, S.N), @@ -92,8 +52,8 @@ with Context() as ctx, Location.unknown(): # CHECK-SAME: ins(%[[A]], %[[B]] # CHECK-SAME: outs(%[[INITC]] - @build_function(RankedTensorType.get((4, 16), f32), - RankedTensorType.get((16, 8), f32)) + @builtin.FuncOp.from_py_func(RankedTensorType.get((4, 16), f32), + RankedTensorType.get((16, 8), f32)) def test_matmul_mono(lhs, rhs): # TODO: Enable outs inference and add sugar for InitTensorOp # construction. @@ -114,9 +74,9 @@ with Context() as ctx, Location.unknown(): # CHECK-NEXT: %[[ADD:.+]] = addi %[[C_ARG]], %[[MUL]] : i32 # CHECK-NEXT: linalg.yield %[[ADD]] : i32 # CHECK-NEXT: -> tensor<4x8xi32> - @build_function(RankedTensorType.get((4, 16), i8), - RankedTensorType.get((16, 8), i8), - RankedTensorType.get((4, 8), i32)) + @builtin.FuncOp.from_py_func(RankedTensorType.get((4, 16), i8), + RankedTensorType.get((16, 8), i8), + RankedTensorType.get((4, 8), i32)) def test_i8i8i32_matmul(lhs, rhs, init_result): return matmul_poly(lhs, rhs, outs=[init_result]) @@ -128,9 +88,9 @@ with Context() as ctx, Location.unknown(): # CHECK-NEXT: %[[ADD:.+]] = addi %[[C_ARG]], %[[MUL]] : i32 # CHECK-NEXT: linalg.yield %[[ADD]] : i32 # CHECK-NEXT: -> tensor<4x8xi32> - @build_function(RankedTensorType.get((4, 16), i8), - RankedTensorType.get((16, 8), i16), - RankedTensorType.get((4, 8), i32)) + @builtin.FuncOp.from_py_func(RankedTensorType.get((4, 16), i8), + RankedTensorType.get((16, 8), i16), + RankedTensorType.get((4, 8), i32)) def test_i8i16i32_matmul(lhs, rhs, init_result): return matmul_poly(lhs, rhs, outs=[init_result]) @@ -142,9 +102,9 @@ with Context() as ctx, Location.unknown(): # CHECK-NEXT: %[[ADD:.+]] = addi %[[C_ARG]], %[[MUL]] : i16 # CHECK-NEXT: linalg.yield %[[ADD]] : i16 # CHECK-NEXT: -> tensor<4x8xi16> - @build_function(RankedTensorType.get((4, 16), i32), - RankedTensorType.get((16, 8), i32), - RankedTensorType.get((4, 8), i16)) + @builtin.FuncOp.from_py_func(RankedTensorType.get((4, 16), i32), + RankedTensorType.get((16, 8), i32), + RankedTensorType.get((4, 8), i16)) def test_i32i32i16_matmul(lhs, rhs, init_result): return matmul_poly(lhs, rhs, outs=[init_result]) @@ -156,9 +116,9 @@ with Context() as ctx, Location.unknown(): # CHECK-NEXT: %[[ADD:.+]] = addf %[[C_ARG]], %[[MUL]] : f32 # CHECK-NEXT: linalg.yield %[[ADD]] : f32 # CHECK-NEXT: -> tensor<4x8xf32> - @build_function(RankedTensorType.get((4, 16), i8), - RankedTensorType.get((16, 8), i8), - RankedTensorType.get((4, 8), f32)) + @builtin.FuncOp.from_py_func(RankedTensorType.get((4, 16), i8), + RankedTensorType.get((16, 8), i8), + RankedTensorType.get((4, 8), f32)) def test_i8i8f32_matmul(lhs, rhs, init_result): return matmul_poly(lhs, rhs, outs=[init_result]) @@ -170,9 +130,9 @@ with Context() as ctx, Location.unknown(): # CHECK-NEXT: %[[ADD:.+]] = addf %[[C_ARG]], %[[MUL]] : f32 # CHECK-NEXT: linalg.yield %[[ADD]] : f32 # CHECK-NEXT: -> tensor<4x8xf32> - @build_function(RankedTensorType.get((4, 16), f16), - RankedTensorType.get((16, 8), f16), - RankedTensorType.get((4, 8), f32)) + @builtin.FuncOp.from_py_func(RankedTensorType.get((4, 16), f16), + RankedTensorType.get((16, 8), f16), + RankedTensorType.get((4, 8), f32)) def test_f16f16f32_matmul(lhs, rhs, init_result): return matmul_poly(lhs, rhs, outs=[init_result]) @@ -184,9 +144,9 @@ with Context() as ctx, Location.unknown(): # CHECK-NEXT: 
%[[ADD:.+]] = addf %[[C_ARG]], %[[MUL]] : f32
 # CHECK-NEXT:   linalg.yield %[[ADD]] : f32
 # CHECK-NEXT: -> tensor<4x8xf32>
-  @build_function(RankedTensorType.get((4, 16), f64),
-                  RankedTensorType.get((16, 8), f64),
-                  RankedTensorType.get((4, 8), f32))
+  @builtin.FuncOp.from_py_func(RankedTensorType.get((4, 16), f64),
+                               RankedTensorType.get((16, 8), f64),
+                               RankedTensorType.get((4, 8), f32))
   def test_f64f64f32_matmul(lhs, rhs, init_result):
     return matmul_poly(lhs, rhs, outs=[init_result])

-- 
GitLab


From f3800664611976e4ccae234d8881a65725358260 Mon Sep 17 00:00:00 2001
From: Lang Hames
Date: Fri, 19 Mar 2021 17:31:29 -0700
Subject: [PATCH 0227/1000] [JITLink] Remove redundant local variable
 definitions from a unit test.

---
 .../llvm/ExecutionEngine/JITLink/JITLink.h    | 28 ++++++
 .../JITLink/LinkGraphTests.cpp                | 95 +++++++++++++++++--
 2 files changed, 117 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
index 799284d38cb7..24c0a75ac53f 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/JITLink.h
@@ -479,6 +479,16 @@ public:
   /// Returns the size of this symbol.
   JITTargetAddress getSize() const { return Size; }
 
+  /// Set the size of this symbol.
+  void setSize(JITTargetAddress Size) {
+    assert(Base && "Cannot set size for null Symbol");
+    assert((Size == 0 || Base->isDefined()) &&
+           "Non-zero size can only be set for defined symbols");
+    assert((Offset + Size <= static_cast<Block &>(*Base).getSize()) &&
+           "Symbol size cannot extend past the end of its containing block");
+    this->Size = Size;
+  }
+
   /// Returns true if this symbol is backed by a zero-fill block.
   /// This method may only be called on defined symbols.
   bool isSymbolZeroFill() const { return getBlock().isZeroFill(); }
@@ -1014,6 +1024,24 @@ public:
     ExternalSymbols.insert(&Sym);
   }
 
+  /// Turn an external symbol into a defined one by attaching it to a block.
+  void makeDefined(Symbol &Sym, Block &Content, JITTargetAddress Offset,
+                   JITTargetAddress Size, Linkage L, Scope S, bool IsLive) {
+    assert(!Sym.isDefined() && !Sym.isAbsolute() &&
+           "Sym is not an external symbol");
+    assert(ExternalSymbols.count(&Sym) && "Symbol is not in the externals set");
+    ExternalSymbols.erase(&Sym);
+    Addressable &OldBase = *Sym.Base;
+    Sym.setBlock(Content);
+    Sym.setOffset(Offset);
+    Sym.setSize(Size);
+    Sym.setLinkage(L);
+    Sym.setScope(S);
+    Sym.setLive(IsLive);
+    Content.getSection().addSymbol(Sym);
+    destroyAddressable(OldBase);
+  }
+
   /// Removes an external symbol. Also removes the underlying Addressable.
   void removeExternalSymbol(Symbol &Sym) {
     assert(!Sym.isDefined() && !Sym.isAbsolute() &&
diff --git a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp
index 810a2fd0e1f3..6e00550cf242 100644
--- a/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp
+++ b/llvm/unittests/ExecutionEngine/JITLink/LinkGraphTests.cpp
@@ -101,14 +101,97 @@ TEST(LinkGraphTest, BlockAndSymbolIteration) {
   EXPECT_TRUE(llvm::count(G.defined_symbols(), &S4));
 }
 
-TEST(LinkGraphTest, SplitBlock) {
-  // Check that the LinkGraph::splitBlock test works as expected.
+TEST(LinkGraphTest, MakeExternal) {
+  // Check that we can make a defined symbol external.
   LinkGraph G("foo", Triple("x86_64-apple-darwin"), 8, support::little,
               getGenericEdgeKindName);
   auto &Sec = G.createSection("__data", RWFlags);
 
+  // Create an initial block.
+  auto &B1 = G.createContentBlock(Sec, BlockContent, 0x1000, 8, 0);
+
+  // Add a symbol to the block.
+  auto &S1 = G.addDefinedSymbol(B1, 0, "S1", 4, Linkage::Strong, Scope::Default,
+                                false, false);
+
+  EXPECT_TRUE(S1.isDefined()) << "Symbol should be defined";
+  EXPECT_FALSE(S1.isExternal()) << "Symbol should not be external";
+  EXPECT_FALSE(S1.isAbsolute()) << "Symbol should not be absolute";
+  EXPECT_TRUE(&S1.getBlock()) << "Symbol should have a non-null block";
+  EXPECT_EQ(S1.getAddress(), 0x1000U) << "Unexpected symbol address";
+
+  EXPECT_EQ(
+      std::distance(G.defined_symbols().begin(), G.defined_symbols().end()), 1U)
+      << "Unexpected number of defined symbols";
+  EXPECT_EQ(
+      std::distance(G.external_symbols().begin(), G.external_symbols().end()),
+      0U)
+      << "Unexpected number of external symbols";
+
+  // Make S1 external, confirm that its flags are updated and that it is
+  // moved from the defined symbols to the externals list.
+  G.makeExternal(S1);
+
+  EXPECT_FALSE(S1.isDefined()) << "Symbol should not be defined";
+  EXPECT_TRUE(S1.isExternal()) << "Symbol should be external";
+  EXPECT_FALSE(S1.isAbsolute()) << "Symbol should not be absolute";
+  EXPECT_EQ(S1.getAddress(), 0U) << "Unexpected symbol address";
+
+  EXPECT_EQ(
+      std::distance(G.defined_symbols().begin(), G.defined_symbols().end()), 0U)
+      << "Unexpected number of defined symbols";
+  EXPECT_EQ(
+      std::distance(G.external_symbols().begin(), G.external_symbols().end()),
+      1U)
+      << "Unexpected number of external symbols";
+}
+
+TEST(LinkGraphTest, MakeDefined) {
+  // Check that we can make an external symbol defined.
+  LinkGraph G("foo", Triple("x86_64-apple-darwin"), 8, support::little,
+              getGenericEdgeKindName);
 
   auto &Sec = G.createSection("__data", RWFlags);
 
-  const char BlockContentBytes[] = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
-                                    0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B,
-                                    0x1C, 0x1D, 0x1E, 0x1F, 0x00};
-  StringRef BlockContent(BlockContentBytes);
+  // Create an initial block.
+  auto &B1 = G.createContentBlock(Sec, BlockContent, 0x1000, 8, 0);
+
+  // Add an external symbol.
+  auto &S1 = G.addExternalSymbol("S1", 4, Linkage::Strong);
+
+  EXPECT_FALSE(S1.isDefined()) << "Symbol should not be defined";
+  EXPECT_TRUE(S1.isExternal()) << "Symbol should be external";
+  EXPECT_FALSE(S1.isAbsolute()) << "Symbol should not be absolute";
+  EXPECT_EQ(S1.getAddress(), 0U) << "Unexpected symbol address";
+  EXPECT_EQ(
+      std::distance(G.defined_symbols().begin(), G.defined_symbols().end()), 0U)
+      << "Unexpected number of defined symbols";
+  EXPECT_EQ(
+      std::distance(G.external_symbols().begin(), G.external_symbols().end()),
+      1U)
+      << "Unexpected number of external symbols";
+
+  // Make S1 defined, confirm that its flags are updated and that it is
+  // moved from the externals list to the defined symbols.
+  G.makeDefined(S1, B1, 0, 4, Linkage::Strong, Scope::Default, false);
+
+  EXPECT_TRUE(S1.isDefined()) << "Symbol should be defined";
+  EXPECT_FALSE(S1.isExternal()) << "Symbol should not be external";
+  EXPECT_FALSE(S1.isAbsolute()) << "Symbol should not be absolute";
+  EXPECT_TRUE(&S1.getBlock()) << "Symbol should have a non-null block";
+  EXPECT_EQ(S1.getAddress(), 0x1000U) << "Unexpected symbol address";
+
+  EXPECT_EQ(
+      std::distance(G.defined_symbols().begin(), G.defined_symbols().end()), 1U)
+      << "Unexpected number of defined symbols";
+  EXPECT_EQ(
+      std::distance(G.external_symbols().begin(), G.external_symbols().end()),
+      0U)
+      << "Unexpected number of external symbols";
+}
+
+TEST(LinkGraphTest, SplitBlock) {
+  // Check that the LinkGraph::splitBlock test works as expected.
 
   LinkGraph G("foo", Triple("x86_64-apple-darwin"), 8, support::little,
               getGenericEdgeKindName);
   auto &Sec = G.createSection("__data", RWFlags);

-- 
GitLab


From 8d05a28887ee1e3cbcddf892de8bbc560432afd2 Mon Sep 17 00:00:00 2001
From: Stella Laurenzo
Date: Fri, 19 Mar 2021 18:44:51 -0700
Subject: [PATCH 0228/1000] [mlir][python] Adapt to `segment_sizes` attribute
 type change.

* Broken by https://reviews.llvm.org/rG1a75be0023cd80fd8560d689999a63d4368c90e6
---
 mlir/lib/Bindings/Python/IRCore.cpp      | 12 ++++++------
 mlir/test/Bindings/Python/ods_helpers.py |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 9d87aa52f7c8..0a4c5fcb40c3 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -1034,8 +1034,8 @@ PyOpView::buildGeneric(py::object cls, py::list resultTypeList,
   py::object operandSegmentSpecObj = cls.attr("_ODS_OPERAND_SEGMENTS");
   py::object resultSegmentSpecObj = cls.attr("_ODS_RESULT_SEGMENTS");
 
-  std::vector<uint64_t> operandSegmentLengths;
-  std::vector<uint64_t> resultSegmentLengths;
+  std::vector<uint32_t> operandSegmentLengths;
+  std::vector<uint32_t> resultSegmentLengths;
 
   // Validate/determine region count.
   auto opRegionSpec = py::cast>(cls.attr("_ODS_REGIONS"));
@@ -1247,8 +1247,8 @@ PyOpView::buildGeneric(py::object cls, py::list resultTypeList,
   // Add result_segment_sizes attribute.
   if (!resultSegmentLengths.empty()) {
     int64_t size = resultSegmentLengths.size();
-    MlirAttribute segmentLengthAttr = mlirDenseElementsAttrUInt64Get(
-        mlirVectorTypeGet(1, &size, mlirIntegerTypeGet(context->get(), 64)),
+    MlirAttribute segmentLengthAttr = mlirDenseElementsAttrUInt32Get(
+        mlirVectorTypeGet(1, &size, mlirIntegerTypeGet(context->get(), 32)),
         resultSegmentLengths.size(), resultSegmentLengths.data());
     (*attributes)["result_segment_sizes"] =
         PyAttribute(context, segmentLengthAttr);
@@ -1257,8 +1257,8 @@ PyOpView::buildGeneric(py::object cls, py::list resultTypeList,
   // Add operand_segment_sizes attribute.
if (!operandSegmentLengths.empty()) { int64_t size = operandSegmentLengths.size(); - MlirAttribute segmentLengthAttr = mlirDenseElementsAttrUInt64Get( - mlirVectorTypeGet(1, &size, mlirIntegerTypeGet(context->get(), 64)), + MlirAttribute segmentLengthAttr = mlirDenseElementsAttrUInt32Get( + mlirVectorTypeGet(1, &size, mlirIntegerTypeGet(context->get(), 32)), operandSegmentLengths.size(), operandSegmentLengths.data()); (*attributes)["operand_segment_sizes"] = PyAttribute(context, segmentLengthAttr); diff --git a/mlir/test/Bindings/Python/ods_helpers.py b/mlir/test/Bindings/Python/ods_helpers.py index 54f68a82fc01..badeac37034f 100644 --- a/mlir/test/Bindings/Python/ods_helpers.py +++ b/mlir/test/Bindings/Python/ods_helpers.py @@ -125,8 +125,8 @@ def testOdsBuildDefaultSizedVariadic(): # CHECK: %[[V2:.+]] = "custom.value" # CHECK: %[[V3:.+]] = "custom.value" # CHECK: "custom.test_op"(%[[V0]], %[[V1]], %[[V2]], %[[V3]]) - # CHECK-SAME: operand_segment_sizes = dense<[1, 2, 1]> : vector<3xi64> - # CHECK-SAME: result_segment_sizes = dense<[2, 1, 1]> : vector<3xi64> + # CHECK-SAME: operand_segment_sizes = dense<[1, 2, 1]> : vector<3xi32> + # CHECK-SAME: result_segment_sizes = dense<[2, 1, 1]> : vector<3xi32> # CHECK-SAME: : (i32, i32, i32, i32) -> (i8, i16, i32, i64) op = TestOp.build_generic( results=[[t0, t1], t2, t3], -- GitLab From bcb34a538729f7c5b49aff9535196e239495db85 Mon Sep 17 00:00:00 2001 From: Senran Zhang Date: Fri, 19 Mar 2021 19:09:11 -0700 Subject: [PATCH 0229/1000] [Utils][vim] Highlight `poison` keyword Reviewed By: awarzynski, MaskRay Differential Revision: https://reviews.llvm.org/D98927 --- llvm/utils/vim/syntax/llvm.vim | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index 48d3daf581ca..70918ffd160c 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -211,7 +211,7 @@ syn match llvmNumber /-\?\<\d\+\>/ syn match llvmFloat /-\?\<\d\+\.\d*\(e[+-]\d\+\)\?\>/ syn match llvmFloat /\<0x\x\+\>/ syn keyword llvmBoolean true false -syn keyword llvmConstant zeroinitializer undef null none +syn keyword llvmConstant zeroinitializer undef null none poison syn match llvmComment /;.*$/ syn region llvmString start=/"/ skip=/\\"/ end=/"/ syn match llvmLabel /[-a-zA-Z$._][-a-zA-Z$._0-9]*:/ -- GitLab From 8bc3685883cf735746d2cc1f232922a643b93c9a Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 19 Mar 2021 19:13:50 -0700 Subject: [PATCH 0230/1000] [llvm-jitlink] Scan input files for first object to determine triple. The previous logic would crash if the first input file was an archive rather than an object. 
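As a standalone sketch of the probing pattern used in the fix below -- checking
a buffer's magic before treating it as an object -- with `Inputs` as a
hypothetical path list and errors consumed rather than fatal:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/Triple.h"
  #include "llvm/BinaryFormat/Magic.h"
  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/MemoryBuffer.h"
  #include <string>

  using namespace llvm;

  // Return the triple of the first relocatable object among Inputs, skipping
  // archives and anything else identify_magic does not classify as an object.
  static Triple firstObjectTriple(ArrayRef<std::string> Inputs) {
    for (const std::string &Path : Inputs) {
      auto Buf = MemoryBuffer::getFile(Path);
      if (!Buf)
        continue; // Unreadable input: skip it rather than abort.
      switch (identify_magic((*Buf)->getBuffer())) {
      case file_magic::elf_relocatable:
      case file_magic::macho_object:
      case file_magic::coff_object: {
        auto Obj =
            object::ObjectFile::createObjectFile((*Buf)->getMemBufferRef());
        if (Obj)
          return (*Obj)->makeTriple();
        consumeError(Obj.takeError()); // Malformed object: keep scanning.
        break;
      }
      default: // e.g. file_magic::archive is not an object; keep scanning.
        break;
      }
    }
    return Triple(); // No object found: default (unknown) triple.
  }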
---
 llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 473afc5f47e7..24e934e20306 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -1008,11 +1008,22 @@ Session::findSymbolInfo(StringRef SymbolName, Twine ErrorMsgStem) {
 static Triple getFirstFileTriple() {
   static Triple FirstTT = []() {
     assert(!InputFiles.empty() && "InputFiles can not be empty");
-    auto ObjBuffer =
-        ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(InputFiles.front())));
-    auto Obj = ExitOnErr(
-        object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef()));
-    return Obj->makeTriple();
+    for (auto InputFile : InputFiles) {
+      auto ObjBuffer =
+          ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(InputFile)));
+      switch (identify_magic(ObjBuffer->getBuffer())) {
+      case file_magic::elf_relocatable:
+      case file_magic::macho_object:
+      case file_magic::coff_object: {
+        auto Obj = ExitOnErr(
+            object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef()));
+        return Obj->makeTriple();
+      }
+      default:
+        break;
+      }
+    }
+    return Triple();
   }();
 
   return FirstTT;

-- 
GitLab


From 5df2af8b0ef33f48b1ee72bcd27bc609b898da52 Mon Sep 17 00:00:00 2001
From: Carl Ritson
Date: Sat, 20 Mar 2021 10:29:08 +0900
Subject: [PATCH 0231/1000] [AMDGPU] Merge SIRemoveShortExecBranches into
 SIPreEmitPeephole

SIRemoveShortExecBranches is an optimisation, so it fits well in the
context of SIPreEmitPeephole.

Test changes relate to early termination from kills, which are now
lowered before branches are considered for removal. As these use
s_cbranch, the execz skips are now retained instead. Currently either
behaviour is valid, as a kill with EXEC=0 is a nop; however, if early
termination is used differently in future, the new behaviour is the
correct one.
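As a self-contained model of the retention check that moves into
SIPreEmitPeephole below (a sketch only: the real code walks
MachineBasicBlocks and queries SIInstrInfo, and `InstrSummary` /
`ExpensiveWhenExecZero` are hypothetical stand-ins for the
SMRD/VMEM/FLAT/DS, s_waitcnt and side-effect checks):

  #include <cstddef>

  // Hypothetical per-instruction summary of the region the skip jumps over.
  struct InstrSummary {
    bool IsConditionalBranch;   // e.g. the backedge of a uniform loop
    bool ExpensiveWhenExecZero; // memory ops, s_waitcnt, unwanted side effects
  };

  // Keep the s_cbranch_execz if running the region with EXEC == 0 could be
  // unsafe or costly, or if the region exceeds the skip threshold. Kills now
  // lower to conditional branches earlier, so they trip the first test and
  // their skips are retained, matching the test changes in this patch.
  static bool mustRetainExeczBranch(const InstrSummary *Region, std::size_t N,
                                    std::size_t SkipThreshold = 12) {
    std::size_t NumInstr = 0;
    for (std::size_t I = 0; I != N; ++I) {
      // With EXEC == 0 a conditional branch may never be taken, so a uniform
      // loop inside the skipped region could spin forever without the skip.
      if (Region[I].IsConditionalBranch)
        return true;
      if (Region[I].ExpensiveWhenExecZero)
        return true;
      if (++NumInstr >= SkipThreshold)
        return true;
    }
    return false; // Short, harmless region: falling through beats branching.
  }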
Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D98917
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |   3 -
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |   2 -
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |   1 -
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp  |  90 +++++++++-
 .../AMDGPU/SIRemoveShortExecBranches.cpp      | 159 ------------------
 .../GlobalISel/llvm.amdgcn.wqm.demote.ll      |  72 +++++---
 .../AMDGPU/insert-skips-flat-vmem-ds.mir      |   2 +-
 llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir |   2 +-
 .../AMDGPU/insert-skips-ignored-insts.mir     |   2 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll  |  72 +++++---
 ...emove-short-exec-branches-gpr-idx-mode.mir |   2 +-
 ...ort-exec-branches-special-instructions.mir |   2 +-
 .../AMDGPU/skip-branch-taildup-ret.mir        |   2 +-
 llvm/test/CodeGen/AMDGPU/skip-if-dead.ll      |  18 +-
 .../transform-block-with-return-to-epilog.ll  |  15 +-
 .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn |   1 -
 16 files changed, 211 insertions(+), 234 deletions(-)
 delete mode 100644 llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cdd59fe0b847..4f9f888506b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -204,9 +204,6 @@ extern char &SIWholeQuadModeID;
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowID;
 
-void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
-extern char &SIRemoveShortExecBranchesID;
-
 void initializeSIPreEmitPeepholePass(PassRegistry &);
 extern char &SIPreEmitPeepholeID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9db4e8c8472f..2b42f9e1281e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -249,7 +249,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
-  initializeSIRemoveShortExecBranchesPass(*PR);
   initializeSIPreEmitPeepholePass(*PR);
   initializeSIInsertSkipsPass(*PR);
   initializeSIMemoryLegalizerPass(*PR);
@@ -1215,7 +1214,6 @@ void GCNPassConfig::addPreEmitPass() {
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIInsertHardClausesID);
 
-  addPass(&SIRemoveShortExecBranchesID);
  addPass(&SIInsertSkipsPassID);
  addPass(&SIPreEmitPeepholeID);
  // The hazard recognizer that runs as part of the post-ra scheduler does not
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 7aa256821167..03b0c0f45f2d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -137,7 +137,6 @@ add_llvm_target(AMDGPUCodeGen
   SIPreEmitPeephole.cpp
   SIProgramInfo.cpp
   SIRegisterInfo.cpp
-  SIRemoveShortExecBranches.cpp
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 5f10fefa469f..93d33fddff52 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -21,6 +21,14 @@ using namespace llvm;
 
 #define DEBUG_TYPE "si-pre-emit-peephole"
 
+static unsigned SkipThreshold;
+
+static cl::opt<unsigned, true> SkipThresholdFlag(
+    "amdgpu-skip-threshold", cl::Hidden,
+    cl::desc(
+        "Number of instructions before jumping over divergent control flow"),
+    cl::location(SkipThreshold), cl::init(12));
+
 namespace {
 
 class SIPreEmitPeephole : public MachineFunctionPass {
@@ -30,6 +38,13 @@ private:
   bool optimizeVccBranch(MachineInstr &MI) const;
   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
+  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
+                            MachineBasicBlock *&TrueMBB,
+                            MachineBasicBlock *&FalseMBB,
+                            SmallVectorImpl<MachineOperand> &Cond);
+  bool mustRetainExeczBranch(const MachineBasicBlock &From,
+                             const MachineBasicBlock &To) const;
+  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
 
 public:
   static char ID;
@@ -258,6 +273,74 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
   return true;
 }
 
+bool SIPreEmitPeephole::getBlockDestinations(
+    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
+    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
+  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
+    return false;
+
+  if (!FalseMBB)
+    FalseMBB = SrcMBB.getNextNode();
+
+  return true;
+}
+
+bool SIPreEmitPeephole::mustRetainExeczBranch(
+    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
+  unsigned NumInstr = 0;
+  const MachineFunction *MF = From.getParent();
+
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might never be taken when EXEC = 0.
+      // Hence we should retain cbranch out of the loop lest it become infinite.
+      if (I->isConditionalBranch())
+        return true;
+
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+        return true;
+
+      // These instructions are potentially expensive even if EXEC = 0.
+      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+          TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
+        return true;
+
+      ++NumInstr;
+      if (NumInstr >= SkipThreshold)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+// Returns true if the skip branch instruction is removed.
+bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
+                                          MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *TrueMBB = nullptr;
+  MachineBasicBlock *FalseMBB = nullptr;
+  SmallVector<MachineOperand, 1> Cond;
+
+  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
+    return false;
+
+  // Consider only the forward branches.
+ if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) || + mustRetainExeczBranch(*FalseMBB, *TrueMBB)) + return false; + + LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); + MI.eraseFromParent(); + SrcMBB.removeSuccessor(TrueMBB); + + return true; +} + bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -265,10 +348,12 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { MachineBasicBlock *EmptyMBBAtEnd = nullptr; bool Changed = false; + MF.RenumberBlocks(); + for (MachineBasicBlock &MBB : MF) { MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator(); MachineBasicBlock::iterator TermI = MBBE; - // Check first terminator for VCC branches to optimize + // Check first terminator for branches to optimize if (TermI != MBB.end()) { MachineInstr &MI = *TermI; switch (MI.getOpcode()) { @@ -276,6 +361,9 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { case AMDGPU::S_CBRANCH_VCCNZ: Changed |= optimizeVccBranch(MI); continue; + case AMDGPU::S_CBRANCH_EXECZ: + Changed |= removeExeczBranch(MI, MBB); + continue; default: break; } diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp deleted file mode 100644 index 104dea8fdff5..000000000000 --- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp +++ /dev/null @@ -1,159 +0,0 @@ -//===-- SIRemoveShortExecBranches.cpp ------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// This pass optmizes the s_cbranch_execz instructions. -/// The pass removes this skip instruction for short branches, -/// if there is no unwanted sideeffect in the fallthrough code sequence. -/// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/Support/CommandLine.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-remove-short-exec-branches" - -static unsigned SkipThreshold; - -static cl::opt SkipThresholdFlag( - "amdgpu-skip-threshold", cl::Hidden, - cl::desc( - "Number of instructions before jumping over divergent control flow"), - cl::location(SkipThreshold), cl::init(12)); - -namespace { - -class SIRemoveShortExecBranches : public MachineFunctionPass { -private: - const SIInstrInfo *TII = nullptr; - bool getBlockDestinations(MachineBasicBlock &SrcMBB, - MachineBasicBlock *&TrueMBB, - MachineBasicBlock *&FalseMBB, - SmallVectorImpl &Cond); - bool mustRetainExeczBranch(const MachineBasicBlock &From, - const MachineBasicBlock &To) const; - bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); - -public: - static char ID; - - SIRemoveShortExecBranches() : MachineFunctionPass(ID) { - initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; -}; - -} // End anonymous namespace. 
- -INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE, - "SI remove short exec branches", false, false) - -char SIRemoveShortExecBranches::ID = 0; - -char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID; - -bool SIRemoveShortExecBranches::getBlockDestinations( - MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB, - MachineBasicBlock *&FalseMBB, SmallVectorImpl &Cond) { - if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond)) - return false; - - if (!FalseMBB) - FalseMBB = SrcMBB.getNextNode(); - - return true; -} - -bool SIRemoveShortExecBranches::mustRetainExeczBranch( - const MachineBasicBlock &From, const MachineBasicBlock &To) const { - unsigned NumInstr = 0; - const MachineFunction *MF = From.getParent(); - - for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); - MBBI != End && MBBI != ToI; ++MBBI) { - const MachineBasicBlock &MBB = *MBBI; - - for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end(); - I != E; ++I) { - // When a uniform loop is inside non-uniform control flow, the branch - // leaving the loop might never be taken when EXEC = 0. - // Hence we should retain cbranch out of the loop lest it become infinite. - if (I->isConditionalBranch()) - return true; - - if (TII->hasUnwantedEffectsWhenEXECEmpty(*I)) - return true; - - if (TII->isKillTerminator(I->getOpcode())) - return true; - - // These instructions are potentially expensive even if EXEC = 0. - if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) || - TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT) - return true; - - ++NumInstr; - if (NumInstr >= SkipThreshold) - return true; - } - } - - return false; -} - -// Returns true if the skip branch instruction is removed. -bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI, - MachineBasicBlock &SrcMBB) { - MachineBasicBlock *TrueMBB = nullptr; - MachineBasicBlock *FalseMBB = nullptr; - SmallVector Cond; - - if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond)) - return false; - - // Consider only the forward branches. 
- if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) || - mustRetainExeczBranch(*FalseMBB, *TrueMBB)) - return false; - - LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); - MI.eraseFromParent(); - SrcMBB.removeSuccessor(TrueMBB); - - return true; -} - -bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - TII = ST.getInstrInfo(); - MF.RenumberBlocks(); - bool Changed = false; - - for (MachineBasicBlock &MBB : MF) { - MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); - if (MBBI == MBB.end()) - continue; - - MachineInstr &MI = *MBBI; - switch (MI.getOpcode()) { - case AMDGPU::S_CBRANCH_EXECZ: - Changed = removeExeczBranch(MI, MBB); - break; - default: - break; - } - } - - return Changed; -} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll index f0d76065ddd5..1b8689d10a1e 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll @@ -166,12 +166,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; SI-NEXT: s_xor_b64 s[2:3], vcc, -1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB2_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB2_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -192,12 +193,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX9-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB2_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -218,12 +220,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-32-NEXT: s_xor_b32 s1, vcc_lo, -1 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB2_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -244,12 +247,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-64-NEXT: s_xor_b64 s[2:3], vcc, -1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB2_3: ; %.continue ; GFX10-64-NEXT: 
s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -284,13 +288,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 BB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; SI-NEXT: s_waitcnt vmcnt(0) @@ -312,13 +317,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 BB3_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -340,13 +346,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s28, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) @@ -368,13 +375,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] ; GFX10-64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) @@ -416,13 +424,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 BB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], 
s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] @@ -444,13 +453,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 BB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] @@ -472,13 +482,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s28, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 @@ -500,13 +511,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB4_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] @@ -660,13 +672,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] -; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: BB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[4:5] @@ -682,12 +695,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; SI-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SI-NEXT: s_xor_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_cbranch_execz BB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 -; 
SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: BB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 @@ -706,13 +720,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: BB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -728,12 +743,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: BB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -752,13 +768,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: BB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_mov_b32 s1, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 @@ -772,12 +789,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: s_xor_b32 s1, s1, -1 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: BB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -796,13 +814,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10-64-NEXT: s_cbranch_execz BB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: BB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, 
s[2:3] ; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -816,12 +835,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_xor_b64 s[2:3], s[2:3], -1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: BB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 @@ -883,13 +903,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[8:9], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[8:9] -; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: BB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: v_mov_b32_e32 v0, s6 @@ -948,13 +969,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: BB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], 0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -1013,13 +1035,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: BB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-32-NEXT: s_branch BB7_5 @@ -1075,13 +1098,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: 
s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir index 6ce629a0dc05..7b37990dfa45 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s --- diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir index 5424ad39b4d9..95b537367219 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0 --- diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir index 928324492d51..97c8b50c50cb 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=3 %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=3 %s -o - | FileCheck %s --- diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 0b0fb98cacb8..9edd1a397b78 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -167,12 +167,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; SI-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB2_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; SI-NEXT: s_cbranch_scc0 BB2_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB2_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -194,12 +195,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB2_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX9-NEXT: s_cbranch_scc0 BB2_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB2_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-NEXT: exp mrt1 v0, v0, v0, 
v0 done vm @@ -221,12 +223,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-32-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s0 ; GFX10-32-NEXT: s_xor_b32 s0, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB2_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s1, s1, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB2_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB2_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-32-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -248,12 +251,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) { ; GFX10-64-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_xor_b64 s[0:1], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB2_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[2:3], s[2:3], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB2_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB2_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX10-64-NEXT: exp mrt1 v0, v0, v0, v0 done vm @@ -289,13 +293,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB3_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 BB3_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB3_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -317,13 +322,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB3_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 BB3_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB3_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -345,13 +351,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v1 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB3_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s28, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB3_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-32-NEXT: s_waitcnt vmcnt(0) @@ 
-373,13 +380,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v1 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB3_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB3_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB3_3: ; %.continue ; GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] ; GFX10-64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX10-64-NEXT: s_waitcnt vmcnt(0) @@ -421,13 +429,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[14:15], vcc ; SI-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; SI-NEXT: s_cbranch_execz BB4_3 ; SI-NEXT: ; %bb.1: ; %.demote ; SI-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; SI-NEXT: s_cbranch_scc0 BB4_4 ; SI-NEXT: ; %bb.2: ; %.demote ; SI-NEXT: s_wqm_b64 s[16:17], s[12:13] ; SI-NEXT: s_and_b64 exec, exec, s[16:17] -; SI-NEXT: ; %bb.3: ; %.continue +; SI-NEXT: BB4_3: ; %.continue ; SI-NEXT: s_or_b64 exec, exec, s[14:15] ; SI-NEXT: v_add_f32_e32 v0, v0, v0 ; SI-NEXT: s_and_b64 exec, exec, s[12:13] @@ -449,13 +458,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX9-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX9-NEXT: s_xor_b64 s[14:15], exec, s[14:15] +; GFX9-NEXT: s_cbranch_execz BB4_3 ; GFX9-NEXT: ; %bb.1: ; %.demote ; GFX9-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX9-NEXT: s_cbranch_scc0 BB4_4 ; GFX9-NEXT: ; %bb.2: ; %.demote ; GFX9-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX9-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX9-NEXT: ; %bb.3: ; %.continue +; GFX9-NEXT: BB4_3: ; %.continue ; GFX9-NEXT: s_or_b64 exec, exec, s[14:15] ; GFX9-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX9-NEXT: s_and_b64 exec, exec, s[12:13] @@ -477,13 +487,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s13, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s13, exec_lo, s13 +; GFX10-32-NEXT: s_cbranch_execz BB4_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote ; GFX10-32-NEXT: s_andn2_b32 s12, s12, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-32-NEXT: ; %bb.2: ; %.demote ; GFX10-32-NEXT: s_wqm_b32 s28, s12 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s28 -; GFX10-32-NEXT: ; %bb.3: ; %.continue +; GFX10-32-NEXT: BB4_3: ; %.continue ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s13 ; GFX10-32-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s12 @@ -505,13 +516,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre ; GFX10-64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[14:15], vcc ; GFX10-64-NEXT: s_xor_b64 s[28:29], exec, s[14:15] +; GFX10-64-NEXT: s_cbranch_execz BB4_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote ; GFX10-64-NEXT: s_andn2_b64 s[12:13], s[12:13], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB4_4 ; GFX10-64-NEXT: ; %bb.2: ; %.demote ; GFX10-64-NEXT: s_wqm_b64 s[16:17], s[12:13] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[16:17] -; GFX10-64-NEXT: ; %bb.3: ; %.continue +; GFX10-64-NEXT: BB4_3: ; %.continue ; 
GFX10-64-NEXT: s_or_b64 exec, exec, s[28:29] ; GFX10-64-NEXT: v_add_f32_e32 v0, v0, v0 ; GFX10-64-NEXT: s_and_b64 exec, exec, s[12:13] @@ -659,13 +671,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: s_cbranch_execz BB6_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB6_7 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[4:5], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[4:5] -; SI-NEXT: ; %bb.3: ; %.continue0 +; SI-NEXT: BB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], s[0:1] ; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -681,12 +694,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc ; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB6_7 ; SI-NEXT: ; %bb.5: ; %.demote1 ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.6: ; %.continue1 +; SI-NEXT: BB6_6: ; %.continue1 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: v_bfrev_b32_e32 v0, 60 ; SI-NEXT: v_mov_b32_e32 v1, 0x3c00 @@ -705,13 +719,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz BB6_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB6_7 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX9-NEXT: ; %bb.3: ; %.continue0 +; GFX9-NEXT: BB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -727,12 +742,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB6_7 ; GFX9-NEXT: ; %bb.5: ; %.demote1 ; GFX9-NEXT: s_mov_b64 exec, 0 -; GFX9-NEXT: ; %bb.6: ; %.continue1 +; GFX9-NEXT: BB6_6: ; %.continue1 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX9-NEXT: v_bfrev_b32_e32 v1, 60 @@ -751,13 +767,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-32-NEXT: s_cbranch_execz BB6_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s2, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s2 -; GFX10-32-NEXT: ; %bb.3: ; %.continue0 +; GFX10-32-NEXT: BB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_mov_b32 s1, s0 ; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 @@ -771,12 +788,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float 
%arg, i32 %index) { ; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo ; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 ; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-32-NEXT: ; %bb.5: ; %.demote1 ; GFX10-32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-32-NEXT: ; %bb.6: ; %.continue1 +; GFX10-32-NEXT: BB6_6: ; %.continue1 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-32-NEXT: v_bfrev_b32_e32 v1, 60 @@ -795,13 +813,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10-64-NEXT: s_cbranch_execz BB6_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5] -; GFX10-64-NEXT: ; %bb.3: ; %.continue0 +; GFX10-64-NEXT: BB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] ; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] @@ -815,12 +834,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] ; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB6_7 ; GFX10-64-NEXT: ; %bb.5: ; %.demote1 ; GFX10-64-NEXT: s_mov_b64 exec, 0 -; GFX10-64-NEXT: ; %bb.6: ; %.continue1 +; GFX10-64-NEXT: BB6_6: ; %.continue1 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00 ; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60 @@ -875,13 +895,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB7_3 ; SI-NEXT: ; %bb.1: ; %.demote0 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB7_9 ; SI-NEXT: ; %bb.2: ; %.demote0 ; SI-NEXT: s_wqm_b64 s[6:7], s[0:1] ; SI-NEXT: s_and_b64 exec, exec, s[6:7] -; SI-NEXT: ; %bb.3: ; %.continue0.preheader +; SI-NEXT: BB7_3: ; %.continue0.preheader ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b64 s[4:5], 0 ; SI-NEXT: s_branch BB7_5 @@ -940,13 +961,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX9-NEXT: s_cbranch_execz BB7_3 ; GFX9-NEXT: ; %bb.1: ; %.demote0 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX9-NEXT: s_cbranch_scc0 BB7_9 ; GFX9-NEXT: ; %bb.2: ; %.demote0 ; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX9-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX9-NEXT: BB7_3: ; %.continue0.preheader ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-NEXT: s_branch BB7_5 @@ -1005,13 +1027,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; 
GFX10-32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-32-NEXT: s_cbranch_execz BB7_3 ; GFX10-32-NEXT: ; %bb.1: ; %.demote0 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-32-NEXT: s_cbranch_scc0 BB7_9 ; GFX10-32-NEXT: ; %bb.2: ; %.demote0 ; GFX10-32-NEXT: s_wqm_b32 s3, s0 ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3 -; GFX10-32-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-32-NEXT: BB7_3: ; %.continue0.preheader ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-32-NEXT: s_mov_b32 s2, 0 ; GFX10-32-NEXT: s_branch BB7_5 @@ -1067,13 +1090,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-64-NEXT: s_cbranch_execz BB7_3 ; GFX10-64-NEXT: ; %bb.1: ; %.demote0 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-64-NEXT: s_cbranch_scc0 BB7_9 ; GFX10-64-NEXT: ; %bb.2: ; %.demote0 ; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1] ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7] -; GFX10-64-NEXT: ; %bb.3: ; %.continue0.preheader +; GFX10-64-NEXT: BB7_3: ; %.continue0.preheader ; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-64-NEXT: s_mov_b64 s[4:5], 0 ; GFX10-64-NEXT: s_branch BB7_5 diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir index 0f0d210799a9..3dddb0fef230 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir +++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are not removed around mode defs. # FIXME: -amdgpu-skip-threshold seems to be backwards. diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir index ee72fa99a129..58b1ab9ace01 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir +++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are not removed around mode defs. # FIXME: -amdgpu-skip-threshold seems to be backwards. 
diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir index 5979720d0cc7..4c53c51d1ce4 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir +++ b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=1000000 -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=1000000 -o - %s | FileCheck %s --- name: skip_branch_taildup_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll index f535e28c6718..690fe5a7e683 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1002,13 +1002,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz BB13_3 ; SI-NEXT: ; %bb.1: ; %bb3 ; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc ; SI-NEXT: s_cbranch_scc0 BB13_6 ; SI-NEXT: ; %bb.2: ; %bb3 ; SI-NEXT: s_andn2_b64 exec, exec, vcc -; SI-NEXT: ; %bb.3: ; %bb4 +; SI-NEXT: BB13_3: ; %bb4 ; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: s_mov_b32 s1, s0 ; SI-NEXT: s_mov_b32 s2, s0 @@ -1043,13 +1044,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX10-WAVE64-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-WAVE64-NEXT: s_cbranch_execz BB13_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 ; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc ; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB13_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc -; GFX10-WAVE64-NEXT: ; %bb.3: ; %bb4 +; GFX10-WAVE64-NEXT: BB13_3: ; %bb4 ; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10-WAVE64-NEXT: s_mov_b32 s1, s0 ; GFX10-WAVE64-NEXT: s_mov_b32 s2, s0 @@ -1082,13 +1084,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2, ; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX10-WAVE32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-WAVE32-NEXT: s_cbranch_execz BB13_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3 ; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 ; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 BB13_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3 ; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo -; GFX10-WAVE32-NEXT: ; %bb.3: ; %bb4 +; GFX10-WAVE32-NEXT: BB13_3: ; %bb4 ; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX10-WAVE32-NEXT: s_mov_b32 s1, s0 ; GFX10-WAVE32-NEXT: s_mov_b32 s2, s0 @@ -1154,12 +1157,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; SI-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; SI-NEXT: s_cbranch_execz BB14_3 ; SI-NEXT: ; %bb.1: ; %kill ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; SI-NEXT: s_cbranch_scc0 BB14_6 ; SI-NEXT: ; %bb.2: ; %kill ; SI-NEXT: s_mov_b64 exec, 0 -; SI-NEXT: ; %bb.3: ; %Flow +; SI-NEXT: 
BB14_3: ; %Flow ; SI-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] ; SI-NEXT: ; implicit-def: $vgpr2 ; SI-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -1190,12 +1194,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE64-NEXT: v_cmp_ge_f32_e32 vcc, 0, v1 ; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10-WAVE64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX10-WAVE64-NEXT: s_cbranch_execz BB14_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %kill ; GFX10-WAVE64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec ; GFX10-WAVE64-NEXT: s_cbranch_scc0 BB14_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE64-NEXT: s_mov_b64 exec, 0 -; GFX10-WAVE64-NEXT: ; %bb.3: ; %Flow +; GFX10-WAVE64-NEXT: BB14_3: ; %Flow ; GFX10-WAVE64-NEXT: s_or_saveexec_b64 s[0:1], s[2:3] ; GFX10-WAVE64-NEXT: ; implicit-def: $vgpr2 ; GFX10-WAVE64-NEXT: s_xor_b64 exec, exec, s[0:1] @@ -1226,12 +1231,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) { ; GFX10-WAVE32-NEXT: v_cmp_ge_f32_e32 vcc_lo, 0, v1 ; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX10-WAVE32-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-WAVE32-NEXT: s_cbranch_execz BB14_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %kill ; GFX10-WAVE32-NEXT: s_andn2_b32 s0, s0, exec_lo ; GFX10-WAVE32-NEXT: s_cbranch_scc0 BB14_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %kill ; GFX10-WAVE32-NEXT: s_mov_b32 exec_lo, 0 -; GFX10-WAVE32-NEXT: ; %bb.3: ; %Flow +; GFX10-WAVE32-NEXT: BB14_3: ; %Flow ; GFX10-WAVE32-NEXT: s_or_saveexec_b32 s0, s1 ; GFX10-WAVE32-NEXT: ; implicit-def: $vgpr2 ; GFX10-WAVE32-NEXT: s_xor_b32 exec_lo, exec_lo, s0 diff --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll index 9f42347f5ec8..e5a019e5d04a 100644 --- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll +++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll @@ -108,25 +108,26 @@ define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(floa ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5 ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GCN: bb.4.Flow1: - ; GCN: successors: %bb.5(0x40000000) + ; GCN: successors: %bb.5(0x40000000), %bb.7(0x40000000) ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN: renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec ; GCN: $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc + ; GCN: S_CBRANCH_EXECZ %bb.7, implicit $exec ; GCN: bb.5.kill0: - ; GCN: successors: %bb.8(0x40000000), %bb.7(0x40000000) + ; GCN: successors: %bb.6(0x40000000), %bb.8(0x40000000) ; GCN: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 ; GCN: dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc - ; GCN: S_CBRANCH_SCC0 %bb.7, implicit $scc - ; GCN: bb.8.kill0: - ; GCN: successors: %bb.6(0x80000000) + ; GCN: S_CBRANCH_SCC0 %bb.8, implicit $scc + ; GCN: bb.6.kill0: + ; GCN: successors: %bb.7(0x80000000) ; GCN: liveins: $sgpr2_sgpr3, $scc ; GCN: $exec = S_MOV_B64 0 - ; GCN: bb.6.end: + ; GCN: bb.7.end: ; GCN: successors: %bb.9(0x80000000) ; GCN: liveins: $sgpr2_sgpr3 ; GCN: $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc ; GCN: S_BRANCH %bb.9 - ; GCN: bb.7: + ; GCN: bb.8: ; GCN: $exec = S_MOV_B64 0 ; GCN: EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec ; GCN: S_ENDPGM 0 diff --git 
a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 6b50aa58bd6b..3693e706bec4 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -222,7 +222,6 @@ static_library("LLVMAMDGPUCodeGen") { "SIPreEmitPeephole.cpp", "SIProgramInfo.cpp", "SIRegisterInfo.cpp", - "SIRemoveShortExecBranches.cpp", "SIShrinkInstructions.cpp", "SIWholeQuadMode.cpp", ] -- GitLab From fe5f4c397f029b66a541b25d4749496785f2d4f5 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Sat, 20 Mar 2021 11:27:52 +0900 Subject: [PATCH 0232/1000] [AMDGPU] Rename SIInsertSkips Pass Pass no longer handles skips. Pass now removes unnecessary unconditional branches and lowers early termination branches. Hence rename to SILateBranchLowering. Move code to handle returns to epilog from SIPreEmitPeephole into SILateBranchLowering. This means SIPreEmitPeephole only contains optional optimisations, and all required transforms are in SILateBranchLowering. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D98915 --- llvm/lib/Target/AMDGPU/AMDGPU.h | 4 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 +- llvm/lib/Target/AMDGPU/CMakeLists.txt | 2 +- ...sertSkips.cpp => SILateBranchLowering.cpp} | 124 +++++++++++------- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 30 ----- llvm/test/CodeGen/AMDGPU/early-term.mir | 2 +- .../test/CodeGen/AMDGPU/kill-infinite-loop.ll | 2 +- llvm/test/CodeGen/AMDGPU/readlane_exec0.mir | 2 +- llvm/test/CodeGen/AMDGPU/shrink-carry.mir | 2 +- llvm/test/CodeGen/AMDGPU/syncscopes.ll | 2 +- .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 2 +- 11 files changed, 88 insertions(+), 91 deletions(-) rename llvm/lib/Target/AMDGPU/{SIInsertSkips.cpp => SILateBranchLowering.cpp} (60%) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 4f9f888506b7..4b0367501ae0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -207,8 +207,8 @@ extern char &SILowerControlFlowID; void initializeSIPreEmitPeepholePass(PassRegistry &); extern char &SIPreEmitPeepholeID; -void initializeSIInsertSkipsPass(PassRegistry &); -extern char &SIInsertSkipsPassID; +void initializeSILateBranchLoweringPass(PassRegistry &); +extern char &SILateBranchLoweringPassID; void initializeSIOptimizeExecMaskingPass(PassRegistry &); extern char &SIOptimizeExecMaskingID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 2b42f9e1281e..ceabee546eba 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -250,7 +250,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeSIWholeQuadModePass(*PR); initializeSILowerControlFlowPass(*PR); initializeSIPreEmitPeepholePass(*PR); - initializeSIInsertSkipsPass(*PR); + initializeSILateBranchLoweringPass(*PR); initializeSIMemoryLegalizerPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); initializeSIPreAllocateWWMRegsPass(*PR); @@ -1214,8 +1214,9 @@ void GCNPassConfig::addPreEmitPass() { if (getOptLevel() > CodeGenOpt::None) addPass(&SIInsertHardClausesID); - addPass(&SIInsertSkipsPassID); - addPass(&SIPreEmitPeepholeID); + addPass(&SILateBranchLoweringPassID); + if (getOptLevel() > CodeGenOpt::None) + addPass(&SIPreEmitPeepholeID); // The hazard recognizer that runs as part of the post-ra scheduler does not // guarantee to be able handle all hazards 
correctly. This is because if there // are multiple scheduling regions in a basic block, the regions are scheduled diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 03b0c0f45f2d..0688336cec2d 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -119,7 +119,7 @@ add_llvm_target(AMDGPUCodeGen SIFormMemoryClauses.cpp SIFrameLowering.cpp SIInsertHardClauses.cpp - SIInsertSkips.cpp + SILateBranchLowering.cpp SIInsertWaitcnts.cpp SIInstrInfo.cpp SIISelLowering.cpp diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp similarity index 60% rename from llvm/lib/Target/AMDGPU/SIInsertSkips.cpp rename to llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 439453e53548..42cc09f8e484 100644 --- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -1,4 +1,4 @@ -//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===// +//===-- SILateBranchLowering.cpp - Final preparation of branches ----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -14,28 +14,23 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "llvm/ADT/DepthFirstIterator.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/InitializePasses.h" using namespace llvm; -#define DEBUG_TYPE "si-insert-skips" +#define DEBUG_TYPE "si-late-branch-lowering" namespace { -class SIInsertSkips : public MachineFunctionPass { +class SILateBranchLowering : public MachineFunctionPass { private: const SIRegisterInfo *TRI = nullptr; const SIInstrInfo *TII = nullptr; MachineDominatorTree *MDT = nullptr; - MachineBasicBlock *EarlyExitBlock = nullptr; - bool EarlyExitClearsExec = false; - - void ensureEarlyExitBlock(MachineBasicBlock &MBB, bool ClearExec); - - void earlyTerm(MachineInstr &MI); + void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock); public: static char ID; @@ -43,12 +38,12 @@ public: unsigned MovOpc; Register ExecReg; - SIInsertSkips() : MachineFunctionPass(ID) {} + SILateBranchLowering() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; StringRef getPassName() const override { - return "SI insert s_cbranch_execz instructions"; + return "SI Final Branch Preparation"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -60,15 +55,15 @@ public: } // end anonymous namespace -char SIInsertSkips::ID = 0; +char SILateBranchLowering::ID = 0; -INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE, +INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE, +INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE, "SI insert s_cbranch_execz instructions", false, false) -char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID; +char &llvm::SILateBranchLoweringPassID = SILateBranchLowering::ID; static void generateEndPgm(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, @@ -89,27 +84,6 @@ static void generateEndPgm(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0); } -void SIInsertSkips::ensureEarlyExitBlock(MachineBasicBlock &MBB, - bool ClearExec) { - MachineFunction *MF = MBB.getParent(); - DebugLoc DL; 
- - if (!EarlyExitBlock) { - EarlyExitBlock = MF->CreateMachineBasicBlock(); - MF->insert(MF->end(), EarlyExitBlock); - generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, - MF->getFunction().getCallingConv() == - CallingConv::AMDGPU_PS); - EarlyExitClearsExec = false; - } - - if (ClearExec && !EarlyExitClearsExec) { - auto ExitI = EarlyExitBlock->getFirstNonPHI(); - BuildMI(*EarlyExitBlock, ExitI, DL, TII->get(MovOpc), ExecReg).addImm(0); - EarlyExitClearsExec = true; - } -} - static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MachineDominatorTree *MDT) { MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/ true); @@ -125,12 +99,11 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI, MDT->getBase().applyUpdates(DTUpdates); } -void SIInsertSkips::earlyTerm(MachineInstr &MI) { +void SILateBranchLowering::earlyTerm(MachineInstr &MI, + MachineBasicBlock *EarlyExitBlock) { MachineBasicBlock &MBB = *MI.getParent(); const DebugLoc DL = MI.getDebugLoc(); - ensureEarlyExitBlock(MBB, true); - auto BranchMI = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC0)) .addMBB(EarlyExitBlock); auto Next = std::next(MI.getIterator()); @@ -142,7 +115,7 @@ void SIInsertSkips::earlyTerm(MachineInstr &MI) { MDT->getBase().insertEdge(&MBB, EarlyExitBlock); } -bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { +bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); @@ -152,6 +125,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; SmallVector EarlyTermInstrs; + SmallVector EpilogInstrs; bool MadeChange = false; for (MachineBasicBlock &MBB : MF) { @@ -163,7 +137,7 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { switch (MI.getOpcode()) { case AMDGPU::S_BRANCH: // Optimize out branches to the next block. - // FIXME: Shouldn't this be handled by BranchFolding? + // This only occurs in -O0 when BranchFolding is not executed. 
if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) { assert(&MI == &MBB.back()); MI.eraseFromParent(); @@ -175,20 +149,72 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) { EarlyTermInstrs.push_back(&MI); break; + case AMDGPU::SI_RETURN_TO_EPILOG: + EpilogInstrs.push_back(&MI); + break; + default: break; } } } - for (MachineInstr *Instr : EarlyTermInstrs) { - // Early termination in GS does nothing - if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS) - earlyTerm(*Instr); - Instr->eraseFromParent(); + // Lower any early exit branches first + if (!EarlyTermInstrs.empty()) { + MachineBasicBlock *EarlyExitBlock = MF.CreateMachineBasicBlock(); + DebugLoc DL; + + MF.insert(MF.end(), EarlyExitBlock); + BuildMI(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII->get(MovOpc), + ExecReg) + .addImm(0); + generateEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII, + MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS); + + for (MachineInstr *Instr : EarlyTermInstrs) { + // Early termination in GS does nothing + if (MF.getFunction().getCallingConv() != CallingConv::AMDGPU_GS) + earlyTerm(*Instr, EarlyExitBlock); + Instr->eraseFromParent(); + } + + EarlyTermInstrs.clear(); + MadeChange = true; + } + + // Now check return to epilog instructions occur at function end + if (!EpilogInstrs.empty()) { + MachineBasicBlock *EmptyMBBAtEnd = nullptr; + assert(!MF.getInfo()->returnsVoid()); + + // If there are multiple returns to epilog then all will + // become jumps to new empty end block. + if (EpilogInstrs.size() > 1) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + for (auto MI : EpilogInstrs) { + auto MBB = MI->getParent(); + if (MBB == &MF.back() && MI == &MBB->back()) + continue; + + // SI_RETURN_TO_EPILOG is not the last instruction. + // Jump to empty block at function end. + if (!EmptyMBBAtEnd) { + EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); + MF.insert(MF.end(), EmptyMBBAtEnd); + } + + MBB->addSuccessor(EmptyMBBAtEnd); + BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) + .addMBB(EmptyMBBAtEnd); + MI->eraseFromParent(); + MadeChange = true; + } + + EpilogInstrs.clear(); } - EarlyTermInstrs.clear(); - EarlyExitBlock = nullptr; return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 93d33fddff52..cc06cd8ae717 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -14,7 +14,6 @@ #include "AMDGPU.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" using namespace llvm; @@ -345,7 +344,6 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); TRI = &TII->getRegisterInfo(); - MachineBasicBlock *EmptyMBBAtEnd = nullptr; bool Changed = false; MF.RenumberBlocks(); @@ -368,34 +366,6 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) { break; } } - // Check all terminators for SI_RETURN_TO_EPILOG - // FIXME: This is not an optimization and should be moved somewhere else. - while (TermI != MBB.end()) { - MachineInstr &MI = *TermI; - if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) { - assert(!MF.getInfo()->returnsVoid()); - - // Graphics shaders returning non-void shouldn't contain S_ENDPGM, - // because external bytecode will be appended at the end. 
- if (&MBB != &MF.back() || &MI != &MBB.back()) { - // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block - // at the end and jump there. - if (!EmptyMBBAtEnd) { - EmptyMBBAtEnd = MF.CreateMachineBasicBlock(); - MF.insert(MF.end(), EmptyMBBAtEnd); - } - - MBB.addSuccessor(EmptyMBBAtEnd); - BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) - .addMBB(EmptyMBBAtEnd); - MI.eraseFromParent(); - MBBE = MBB.getFirstTerminator(); - TermI = MBBE; - continue; - } - } - TermI++; - } if (!ST.hasVGPRIndexMode()) continue; diff --git a/llvm/test/CodeGen/AMDGPU/early-term.mir b/llvm/test/CodeGen/AMDGPU/early-term.mir index fc896c54512e..39ff92bd5819 100644 --- a/llvm/test/CodeGen/AMDGPU/early-term.mir +++ b/llvm/test/CodeGen/AMDGPU/early-term.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-insert-skips -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-late-branch-lowering -verify-machineinstrs %s -o - | FileCheck %s --- | define amdgpu_ps void @early_term_scc0_end_block() { diff --git a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll index 702ba881be89..8c9bdff36873 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-infinite-loop.ll @@ -50,7 +50,7 @@ end: ; CHECK-LABEL: only_kill ; CHECK: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm -; SIInsertSkips inserts an extra null export here, but it should be harmless. +; SILateBranchLowering inserts an extra null export here, but it should be harmless. ; CHECK: exp null off, off, off, off done vm ; CHECK-NEXT: s_endpgm define amdgpu_ps void @only_kill() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir b/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir index e7660a14de91..597ac24cc533 100644 --- a/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir +++ b/llvm/test/CodeGen/AMDGPU/readlane_exec0.mir @@ -1,4 +1,4 @@ -# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-skips -verify-machineinstrs | FileCheck -check-prefix=GCN %s +# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-late-branch-lowering -verify-machineinstrs | FileCheck -check-prefix=GCN %s # GCN-LABEL: readlane_exec0 # GCN: bb.0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-carry.mir b/llvm/test/CodeGen/AMDGPU/shrink-carry.mir index d828f0be4319..74b81d8d29a3 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-carry.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-carry.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-insert-skips -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -start-before si-shrink-instructions -stop-before si-late-branch-lowering -o - %s | FileCheck -check-prefix=GCN %s # GCN-LABEL: name: subbrev{{$}} # GCN: V_SUBBREV_U32_e32 0, undef $vgpr0, implicit-def $vcc, implicit killed $vcc, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/syncscopes.ll b/llvm/test/CodeGen/AMDGPU/syncscopes.ll index e78967bbf8ca..2a7c87ea3385 100644 --- a/llvm/test/CodeGen/AMDGPU/syncscopes.ll +++ b/llvm/test/CodeGen/AMDGPU/syncscopes.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-after=si-insert-skips < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -stop-after=si-late-branch-lowering < %s | FileCheck 
--check-prefix=GCN %s ; GCN-LABEL: name: syncscopes ; GCN: FLAT_STORE_DWORD killed renamable $vgpr1_vgpr2, killed renamable $vgpr0, 0, 0, implicit $exec, implicit $flat_scr :: (store syncscope("agent") seq_cst 4 into %ir.agent_out) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 3693e706bec4..cd6cac3a5b70 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -203,7 +203,7 @@ static_library("LLVMAMDGPUCodeGen") { "SIFrameLowering.cpp", "SIISelLowering.cpp", "SIInsertHardClauses.cpp", - "SIInsertSkips.cpp", + "SILateBranchLowering.cpp", "SIInsertWaitcnts.cpp", "SIInstrInfo.cpp", "SILoadStoreOptimizer.cpp", -- GitLab From 28d58d8fe2094af6902dee7b4d68ec30a3e9d737 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 19 Mar 2021 20:35:59 -0700 Subject: [PATCH 0233/1000] [Driver] Stop searching other prefixes once a GCC installation is found in one prefix so that when --sysroot is specified, the detected GCC installation will not be overridden by another from /usr which happens to have a larger version. This behavior is particularly inconvenient when the system has a larger version GCC while the user wants to try out an older sysroot. Delete some tests from linux-ld.c which overlap with cross-linux.c --- clang/lib/Driver/ToolChains/Gnu.cpp | 7 +- clang/test/Driver/linux-ld.c | 63 +----------------- clang/unittests/Driver/ToolChainTest.cpp | 82 ++++++++++++++++++------ 3 files changed, 71 insertions(+), 81 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 38971288e38f..eb32f4b920b5 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -1955,7 +1955,8 @@ void Generic_GCC::GCCInstallationDetector::init( // Loop over the various components which exist and select the best GCC // installation available. GCC installs are ranked by version number. - Version = GCCVersion::Parse("0.0.0"); + const GCCVersion VersionZero = GCCVersion::Parse("0.0.0"); + Version = VersionZero; for (const std::string &Prefix : Prefixes) { auto &VFS = D.getVFS(); if (!VFS.exists(Prefix)) @@ -1988,6 +1989,10 @@ void Generic_GCC::GCCInstallationDetector::init( ScanLibDirForGCCTriple(TargetTriple, Args, LibDir, Candidate, true, GCCDirExists, GCCCrossDirExists); } + + // Skip other prefixes once a GCC installation is found. 
+ if (Version > VersionZero) + break; } } diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c index eba09d2970cc..1aa955737438 100644 --- a/clang/test/Driver/linux-ld.c +++ b/clang/test/Driver/linux-ld.c @@ -507,28 +507,6 @@ // CHECK-64-TO-32-SYSROOT: "-L[[SYSROOT]]/lib" // CHECK-64-TO-32-SYSROOT: "-L[[SYSROOT]]/usr/lib" // -// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=i386-unknown-linux -rtlib=platform -m32 \ -// RUN: -ccc-install-dir %S/Inputs/fake_install_tree/bin \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-INSTALL-DIR-32 %s -// CHECK-INSTALL-DIR-32: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-INSTALL-DIR-32: "{{.*}}/Inputs/fake_install_tree/bin/../lib/gcc/i386-unknown-linux/4.7.0{{/|\\\\}}crtbegin.o" -// CHECK-INSTALL-DIR-32: "-L{{.*}}/Inputs/fake_install_tree/bin/../lib/gcc/i386-unknown-linux/4.7.0" -// -// Check that with 64-bit builds, we don't actually use the install directory -// as its version of GCC is lower than our sysrooted version. -// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=x86_64-unknown-linux -rtlib=platform -m64 \ -// RUN: -ccc-install-dir %S/Inputs/fake_install_tree/bin \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-INSTALL-DIR-64 %s -// CHECK-INSTALL-DIR-64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-INSTALL-DIR-64: "{{.*}}/usr/lib/gcc/x86_64-unknown-linux/4.6.0{{/|\\\\}}crtbegin.o" -// CHECK-INSTALL-DIR-64: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-unknown-linux/4.6.0" -// // Check that we support unusual patch version formats, including missing that // component. 
// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ @@ -538,45 +516,8 @@ // RUN: --sysroot=%S/Inputs/basic_linux_tree \ // RUN: | FileCheck --check-prefix=CHECK-GCC-VERSION1 %s // CHECK-GCC-VERSION1: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-GCC-VERSION1: "{{.*}}/Inputs/gcc_version_parsing1/bin/../lib/gcc/i386-unknown-linux/4.7{{/|\\\\}}crtbegin.o" -// CHECK-GCC-VERSION1: "-L{{.*}}/Inputs/gcc_version_parsing1/bin/../lib/gcc/i386-unknown-linux/4.7" -// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=i386-unknown-linux -rtlib=platform -m32 \ -// RUN: -ccc-install-dir %S/Inputs/gcc_version_parsing2/bin \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-GCC-VERSION2 %s -// CHECK-GCC-VERSION2: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-GCC-VERSION2: "{{.*}}/Inputs/gcc_version_parsing2/bin/../lib/gcc/i386-unknown-linux/4.7.x{{/|\\\\}}crtbegin.o" -// CHECK-GCC-VERSION2: "-L{{.*}}/Inputs/gcc_version_parsing2/bin/../lib/gcc/i386-unknown-linux/4.7.x" -// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=i386-unknown-linux -rtlib=platform -m32 \ -// RUN: -ccc-install-dir %S/Inputs/gcc_version_parsing3/bin \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-GCC-VERSION3 %s -// CHECK-GCC-VERSION3: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-GCC-VERSION3: "{{.*}}/Inputs/gcc_version_parsing3/bin/../lib/gcc/i386-unknown-linux/4.7.99-rc5{{/|\\\\}}crtbegin.o" -// CHECK-GCC-VERSION3: "-L{{.*}}/Inputs/gcc_version_parsing3/bin/../lib/gcc/i386-unknown-linux/4.7.99-rc5" -// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=i386-unknown-linux -rtlib=platform -m32 \ -// RUN: -ccc-install-dir %S/Inputs/gcc_version_parsing4/bin \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-GCC-VERSION4 %s -// CHECK-GCC-VERSION4: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-GCC-VERSION4: "{{.*}}/Inputs/gcc_version_parsing4/bin/../lib/gcc/i386-unknown-linux/4.7.99{{/|\\\\}}crtbegin.o" -// CHECK-GCC-VERSION4: "-L{{.*}}/Inputs/gcc_version_parsing4/bin/../lib/gcc/i386-unknown-linux/4.7.99" -// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ -// RUN: --target=i386-unknown-linux -rtlib=platform -m32 \ -// RUN: -ccc-install-dir %S/Inputs/gcc_version_parsing5/bin \ -// RUN: --gcc-toolchain="" \ -// RUN: --sysroot=%S/Inputs/basic_linux_tree \ -// RUN: | FileCheck --check-prefix=CHECK-GCC-VERSION5 %s -// CHECK-GCC-VERSION5: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-GCC-VERSION5: "{{.*}}/Inputs/gcc_version_parsing5/bin/../lib/gcc/i386-unknown-linux/5{{/|\\\\}}crtbegin.o" -// CHECK-GCC-VERSION5: "-L{{.*}}/Inputs/gcc_version_parsing5/bin/../lib/gcc/i386-unknown-linux/5" -// +// CHECK-GCC-VERSION1: "{{.*}}/Inputs/basic_linux_tree/usr/lib/gcc/i386-unknown-linux/4.6.0{{/|\\\\}}crtbegin.o" + // Test a simulated installation of libc++ on Linux, both through sysroot and // the installation path of Clang. 
// RUN: %clangxx -no-canonical-prefixes -x c++ %s -### -o %t.o 2>&1 \ diff --git a/clang/unittests/Driver/ToolChainTest.cpp b/clang/unittests/Driver/ToolChainTest.cpp index 35060563ab97..87f476c98dc9 100644 --- a/clang/unittests/Driver/ToolChainTest.cpp +++ b/clang/unittests/Driver/ToolChainTest.cpp @@ -31,11 +31,8 @@ TEST(ToolChainTest, VFSGCCInstallation) { IntrusiveRefCntPtr<DiagnosticIDs> DiagID(new DiagnosticIDs()); struct TestDiagnosticConsumer : public DiagnosticConsumer {}; - DiagnosticsEngine Diags(DiagID, &*DiagOpts, new TestDiagnosticConsumer); IntrusiveRefCntPtr<llvm::vfs::InMemoryFileSystem> InMemoryFileSystem( new llvm::vfs::InMemoryFileSystem); - Driver TheDriver("/bin/clang", "arm-linux-gnueabihf", Diags, - "clang LLVM compiler", InMemoryFileSystem); const char *EmptyFiles[] = { "foo.cpp", @@ -53,31 +50,78 @@ TEST(ToolChainTest, VFSGCCInstallation) { "/usr/include/arm-linux-gnueabi/.keep", "/usr/include/arm-linux-gnueabihf/.keep", "/lib/arm-linux-gnueabi/.keep", - "/lib/arm-linux-gnueabihf/.keep"}; + "/lib/arm-linux-gnueabihf/.keep", + + "/sysroot/usr/lib/gcc/arm-linux-gnueabi/4.5.1/crtbegin.o", + "/sysroot/usr/lib/gcc/arm-linux-gnueabi/4.5.1/crtend.o", + "/sysroot/usr/lib/gcc/arm-linux-gnueabihf/4.5.3/crtbegin.o", + "/sysroot/usr/lib/gcc/arm-linux-gnueabihf/4.5.3/crtend.o", + "/sysroot/usr/lib/arm-linux-gnueabi/crt1.o", + "/sysroot/usr/lib/arm-linux-gnueabi/crti.o", + "/sysroot/usr/lib/arm-linux-gnueabi/crtn.o", + "/sysroot/usr/lib/arm-linux-gnueabihf/crt1.o", + "/sysroot/usr/lib/arm-linux-gnueabihf/crti.o", + "/sysroot/usr/lib/arm-linux-gnueabihf/crtn.o", + "/sysroot/usr/include/arm-linux-gnueabi/.keep", + "/sysroot/usr/include/arm-linux-gnueabihf/.keep", + "/sysroot/lib/arm-linux-gnueabi/.keep", + "/sysroot/lib/arm-linux-gnueabihf/.keep", + }; for (const char *Path : EmptyFiles) InMemoryFileSystem->addFile(Path, 0, llvm::MemoryBuffer::getMemBuffer("\n")); - std::unique_ptr<Compilation> C(TheDriver.BuildCompilation( - {"-fsyntax-only", "--gcc-toolchain=", "--sysroot=", "foo.cpp"})); - EXPECT_TRUE(C); - - std::string S; { - llvm::raw_string_ostream OS(S); - C->getDefaultToolChain().printVerboseInfo(OS); + DiagnosticsEngine Diags(DiagID, &*DiagOpts, new TestDiagnosticConsumer); + Driver TheDriver("/bin/clang", "arm-linux-gnueabihf", Diags, + "clang LLVM compiler", InMemoryFileSystem); + std::unique_ptr<Compilation> C(TheDriver.BuildCompilation( + {"-fsyntax-only", "--gcc-toolchain=", "--sysroot=", "foo.cpp"})); + ASSERT_TRUE(C); + std::string S; + { + llvm::raw_string_ostream OS(S); + C->getDefaultToolChain().printVerboseInfo(OS); + } +#if _WIN32 + std::replace(S.begin(), S.end(), '\\', '/'); +#endif + EXPECT_EQ( + "Found candidate GCC installation: " + "/usr/lib/gcc/arm-linux-gnueabihf/4.6.3\n" + "Selected GCC installation: /usr/lib/gcc/arm-linux-gnueabihf/4.6.3\n" + "Candidate multilib: .;@m32\n" + "Selected multilib: .;@m32\n", + S); + } + + { + DiagnosticsEngine Diags(DiagID, &*DiagOpts, new TestDiagnosticConsumer); + Driver TheDriver("/bin/clang", "arm-linux-gnueabihf", Diags, + "clang LLVM compiler", InMemoryFileSystem); + std::unique_ptr<Compilation> C(TheDriver.BuildCompilation( + {"-fsyntax-only", "--gcc-toolchain=", "--sysroot=/sysroot", + "foo.cpp"})); + ASSERT_TRUE(C); + std::string S; + { + llvm::raw_string_ostream OS(S); + C->getDefaultToolChain().printVerboseInfo(OS); + } #if _WIN32 - std::replace(S.begin(), S.end(), '\\', '/'); + std::replace(S.begin(), S.end(), '\\', '/'); #endif - EXPECT_EQ( - "Found candidate GCC installation: " - "/usr/lib/gcc/arm-linux-gnueabihf/4.6.3\n" - "Selected GCC installation: 
/usr/lib/gcc/arm-linux-gnueabihf/4.6.3\n" - "Candidate multilib: .;@m32\n" - "Selected multilib: .;@m32\n", - S); + // Test that 4.5.3 from --sysroot is not overridden by 4.6.3 (larger + // version) from /usr. + EXPECT_EQ("Found candidate GCC installation: " + "/sysroot/usr/lib/gcc/arm-linux-gnueabihf/4.5.3\n" + "Selected GCC installation: " + "/sysroot/usr/lib/gcc/arm-linux-gnueabihf/4.5.3\n" + "Candidate multilib: .;@m32\n" + "Selected multilib: .;@m32\n", + S); + } } TEST(ToolChainTest, VFSGCCInstallationRelativeDir) { -- GitLab From d5c1d305b33c02168c43da92acfb11a2376f9388 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 19 Mar 2021 20:39:48 -0700 Subject: [PATCH 0234/1000] [RISCV] Rename WriteShift/ReadShift scheduler classes to WriteShiftImm/ReadShiftImm. Move variable shifts from WriteIALU/ReadIALU to new WriteShiftReg/ReadShiftReg. Previously only immediate shifts were in WriteShift; register shifts were grouped with IALU. It seems likely that immediate shifts would be as fast as or faster than register shifts, and that immediate shifts wouldn't be any faster than IALU, so if either kind deserved its own group it should be register shifts, not immediate shifts. Rather than try to flip them, let's just add more granularity and give each kind its own class. I've used new names for both to make them unambiguous and to force any downstream implementations to put correct information in their scheduler models. Reviewed By: evandro Differential Revision: https://reviews.llvm.org/D98911 --- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 16 ++++++++-------- llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 14 +++++++------- llvm/lib/Target/RISCV/RISCVSchedRocket.td | 12 ++++++++---- llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 12 ++++++++---- llvm/lib/Target/RISCV/RISCVSchedule.td | 12 ++++++++---- 5 files changed, 39 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 0f7eb248377b..d58d56b673b7 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -393,7 +393,7 @@ class Shift_ri funct3, string opcodestr> : RVInstIShift, - Sched<[WriteShift, ReadShift]>; + Sched<[WriteShiftImm, ReadShiftImm]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class ALU_rr funct7, bits<3> funct3, string opcodestr> @@ -418,7 +418,7 @@ class ShiftW_ri funct3, string opcodestr> : RVInstIShiftW, - Sched<[WriteShift32, ReadShift32]>; + Sched<[WriteShiftImm32, ReadShiftImm32]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in class ALUW_rr funct7, bits<3> funct3, string opcodestr> @@ -491,12 +491,12 @@ def SRAI : Shift_ri<1, 0b101, "srai">; def ADD : ALU_rr<0b0000000, 0b000, "add">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; def SUB : ALU_rr<0b0100000, 0b000, "sub">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def SLL : ALU_rr<0b0000000, 0b001, "sll">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def SLL : ALU_rr<0b0000000, 0b001, "sll">, Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>; def SLT : ALU_rr<0b0000000, 0b010, "slt">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; def SLTU : ALU_rr<0b0000000, 0b011, "sltu">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; def XOR : ALU_rr<0b0000000, 0b100, "xor">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def SRL : ALU_rr<0b0000000, 0b101, "srl">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; -def SRA : ALU_rr<0b0100000, 0b101, "sra">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; +def SRL : ALU_rr<0b0000000, 0b101, "srl">, Sched<[WriteShiftReg, ReadShiftReg, 
ReadShiftReg]>; +def SRA : ALU_rr<0b0100000, 0b101, "sra">, Sched<[WriteShiftReg, ReadShiftReg, ReadShiftReg]>; def OR : ALU_rr<0b0000000, 0b110, "or">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; def AND : ALU_rr<0b0000000, 0b111, "and">, Sched<[WriteIALU, ReadIALU, ReadIALU]>; @@ -578,11 +578,11 @@ def ADDW : ALUW_rr<0b0000000, 0b000, "addw">, def SUBW : ALUW_rr<0b0100000, 0b000, "subw">, Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>; def SLLW : ALUW_rr<0b0000000, 0b001, "sllw">, - Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>; + Sched<[WriteShiftReg32, ReadShiftReg32, ReadShiftReg32]>; def SRLW : ALUW_rr<0b0000000, 0b101, "srlw">, - Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>; + Sched<[WriteShiftReg32, ReadShiftReg32, ReadShiftReg32]>; def SRAW : ALUW_rr<0b0100000, 0b101, "sraw">, - Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>; + Sched<[WriteShiftReg32, ReadShiftReg32, ReadShiftReg32]>; } // Predicates = [IsRV64] //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index 232b2e05f40e..86f96c1529b1 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -435,9 +435,9 @@ def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd), } def C_SRLI : Shift_right<0b00, "c.srli", GPRC, uimmlog2xlennonzero>, - Sched<[WriteShift, ReadShift]>; + Sched<[WriteShiftImm, ReadShiftImm]>; def C_SRAI : Shift_right<0b01, "c.srai", GPRC, uimmlog2xlennonzero>, - Sched<[WriteShift, ReadShift]>; + Sched<[WriteShiftImm, ReadShiftImm]>; let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_ANDI : RVInst16CB<0b100, 0b01, (outs GPRC:$rs1_wb), (ins GPRC:$rs1, simm6:$imm), @@ -480,7 +480,7 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb), (ins GPRNoX0:$rd, uimmlog2xlennonzero:$imm), "c.slli", "$rd, $imm">, - Sched<[WriteShift, ReadShift]> { + Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; let Inst{6-2} = imm{4-0}; } @@ -653,7 +653,7 @@ def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rs1_wb), def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb), (ins GPRX0:$rd, uimmlog2xlennonzero:$imm), "c.slli", "$rd, $imm">, - Sched<[WriteShift, ReadShift]> { + Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; let Inst{6-2} = imm{4-0}; let Inst{11-7} = 0; @@ -662,7 +662,7 @@ def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb), def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd), "c.slli64", "$rd">, - Sched<[WriteShift, ReadShift]> { + Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; let Inst{6-2} = 0; let Inst{12} = 0; @@ -671,7 +671,7 @@ def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd), def C_SRLI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb), (ins GPRC:$rd), "c.srli64", "$rd">, - Sched<[WriteShift, ReadShift]> { + Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; let Inst{6-2} = 0; let Inst{11-10} = 0; @@ -681,7 +681,7 @@ def C_SRLI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb), def C_SRAI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb), (ins GPRC:$rd), "c.srai64", "$rd">, - Sched<[WriteShift, ReadShift]> { + Sched<[WriteShiftImm, ReadShiftImm]> { let Constraints = "$rd = $rd_wb"; let Inst{6-2} = 0; let Inst{11-10} = 1; diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td 
b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index de2cdf512e87..68e5dba94a09 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -52,8 +52,10 @@ def : WriteRes; // Integer arithmetic and logic def : WriteRes; def : WriteRes; -def : WriteRes; -def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; // Integer multiplication let Latency = 4 in { @@ -181,8 +183,10 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index e57ba4f61b98..5e3b731b9774 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -45,8 +45,10 @@ def : WriteRes; let Latency = 3 in { def : WriteRes; def : WriteRes; -def : WriteRes; -def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; } // Integer multiplication @@ -170,8 +172,10 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; -def : ReadAdvance; -def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td index 0806be8a8d87..0af4d49f5cf1 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedule.td +++ b/llvm/lib/Target/RISCV/RISCVSchedule.td @@ -9,8 +9,10 @@ /// Define scheduler resources associated with def operands. def WriteIALU : SchedWrite; // 32 or 64-bit integer ALU operations def WriteIALU32 : SchedWrite; // 32-bit integer ALU operations on RV64I -def WriteShift32 : SchedWrite; // 32-bit shift operations on RV64Ix -def WriteShift : SchedWrite; // 32 or 64-bit shift operations +def WriteShiftImm : SchedWrite; // 32 or 64-bit shift by immediate operations +def WriteShiftImm32 : SchedWrite; // 32-bit shift by immediate operations on RV64Ix +def WriteShiftReg : SchedWrite; // 32 or 64-bit shift by register operations +def WriteShiftReg32 : SchedWrite; // 32-bit shift by register operations on RV64Ix def WriteIDiv : SchedWrite; // 32-bit or 64-bit divide and remainder def WriteIDiv32 : SchedWrite; // 32-bit divide and remainder on RV64I def WriteIMul : SchedWrite; // 32-bit or 64-bit multiply @@ -97,8 +99,10 @@ def ReadFMemBase : SchedRead; def ReadStoreData : SchedRead; def ReadIALU : SchedRead; def ReadIALU32 : SchedRead; // 32-bit integer ALU operations on RV64I -def ReadShift : SchedRead; -def ReadShift32 : SchedRead; // 32-bit shift operations on RV64Ix +def ReadShiftImm : SchedRead; +def ReadShiftImm32 : SchedRead; // 32-bit shift by immediate operations on RV64Ix +def ReadShiftReg : SchedRead; +def ReadShiftReg32 : SchedRead; // 32-bit shift by register operations on RV64Ix def ReadIDiv : SchedRead; def ReadIDiv32 : SchedRead; def ReadIMul : SchedRead; -- GitLab From ea48bf8649e12db8dc85078b001b9cc8d52a72b5 Mon Sep 17 00:00:00 2001 From: Nemanja Ivanovic Date: Fri, 19 Mar 2021 22:52:40 -0500 Subject: [PATCH 0235/1000] [PowerPC][NFC] Do not produce i64 constants in 32-bit mode There are some instances where we produce constants of type MVT::i64 unconditionally in the target DAG combines. This is not actually valid in 32-bit mode.
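A minimal sketch of the idiom in question — illustrative only, not part of the patch, reusing the `DAG`, `dl`, and `VecNo` names from the diff below and assuming the surrounding lowering code:

```cpp
// Illustrative only: take the constant's type from the target pointer width
// instead of hard-coding MVT::i64, which is not valid on 32-bit subtargets.
MVT PtrVT = getPointerTy(DAG.getDataLayout()); // i32 on PPC32, i64 on PPC64
SDValue VecIdx = DAG.getConstant(VecNo, dl, PtrVT);
```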
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 4 ++-- llvm/lib/Target/PowerPC/PPCInstrPrefix.td | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 4fa1689a77c4..5e004c4522b3 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -10111,7 +10111,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, WideVec, DAG.getConstant(Subtarget.isLittleEndian() ? NumVecs - 1 - VecNo : VecNo, - dl, MVT::i64)); + dl, getPointerTy(DAG.getDataLayout()))); RetOps.push_back(Extract); } return DAG.getMergeValues(RetOps, dl); @@ -10395,7 +10395,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op, for (unsigned Idx = 0; Idx < NumVecs; ++Idx) { unsigned VecNum = Subtarget.isLittleEndian() ? NumVecs - 1 - Idx : Idx; SDValue Elt = DAG.getNode(PPCISD::EXTRACT_VSX_REG, dl, MVT::v16i8, Value, - DAG.getConstant(VecNum, dl, MVT::i64)); + DAG.getConstant(VecNum, dl, getPointerTy(DAG.getDataLayout()))); SDValue Store = DAG.getStore(StoreChain, dl, Elt, BasePtr, SN->getPointerInfo().getWithOffset(Idx * 16), diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td index 5981bca37208..7f12a404dc04 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td +++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td @@ -18,10 +18,10 @@ def SDT_PPCPairBuild : SDTypeProfile<1, 2, [ SDTCisVT<0, v256i1>, SDTCisVT<1, v4i32>, SDTCisVT<2, v4i32> ]>; def SDT_PPCAccExtractVsx : SDTypeProfile<1, 2, [ - SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisInt<2> + SDTCisVT<0, v4i32>, SDTCisVT<1, v512i1>, SDTCisPtrTy<2> ]>; def SDT_PPCPairExtractVsx : SDTypeProfile<1, 2, [ - SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisInt<2> + SDTCisVT<0, v4i32>, SDTCisVT<1, v256i1>, SDTCisPtrTy<2> ]>; def SDT_PPCxxmfacc : SDTypeProfile<1, 1, [ SDTCisVT<0, v512i1>, SDTCisVT<1, v512i1> @@ -1608,13 +1608,13 @@ let Predicates = [MMA] in { v16i8:$vs3, v16i8:$vs2)), (XXMTACC Concats.VecsToVecQuad)>; def : Pat<(v512i1 (PPCxxmfacc v512i1:$AS)), (XXMFACC acc:$AS)>; - def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 0))), + def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 0)), Extracts.Vec0>; - def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 1))), + def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 1)), Extracts.Vec1>; - def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 2))), + def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 2)), Extracts.Vec2>; - def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, (i64 3))), + def : Pat<(v4i32 (PPCAccExtractVsx acc:$v, 3)), Extracts.Vec3>; } @@ -1623,9 +1623,9 @@ let Predicates = [PairedVectorMemops] in { Concats.VecsToVecPair0>; def : Pat<(v256i1 (int_ppc_vsx_assemble_pair v16i8:$vs1, v16i8:$vs0)), Concats.VecsToVecPair0>; - def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 0))), + def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 0)), (v4i32 (EXTRACT_SUBREG $v, sub_vsx0))>; - def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, (i64 1))), + def : Pat<(v4i32 (PPCPairExtractVsx vsrpevenrc:$v, 1)), (v4i32 (EXTRACT_SUBREG $v, sub_vsx1))>; } -- GitLab From cdb6eb7e8372027e74d6b0fb1258fff37e2b3b5a Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Sat, 20 Mar 2021 01:23:12 +0000 Subject: [PATCH 0236/1000] Update syntax for amx.tile_muli to use two Unit attr to mark the zext case This makes the annotation tied to the operand and the use of a keyword more explicit/readable on what it means. 
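For reference, a before/after sketch of the printed form (a fragment only; `%a`, `%b`, and `%c` are assumed tile values, with shapes borrowed from the tests below):

```mlir
// Old syntax: extension behavior encoded as a trailing array of booleans.
%0 = amx.tile_muli %a, %b, %c [true, false]
  : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32>
// New syntax: an optional `zext` keyword per operand; an operand without
// the keyword is treated as sign extended.
%1 = amx.tile_muli %a zext, %b, %c
  : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32>
```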
Differential Revision: https://reviews.llvm.org/D99001 --- mlir/include/mlir/Dialect/AMX/AMX.td | 14 ++++++++------ mlir/lib/Dialect/AMX/IR/AMXDialect.cpp | 2 -- .../AMX/Transforms/LegalizeForLLVMExport.cpp | 4 ++-- mlir/test/Dialect/AMX/invalid.mlir | 10 ---------- mlir/test/Dialect/AMX/legalize-for-llvm.mlir | 8 ++++---- mlir/test/Dialect/AMX/roundtrip.mlir | 12 ++++++++++-- .../Dialect/Vector/CPU/AMX/test-muli-ext.mlir | 8 ++++---- .../Dialect/Vector/CPU/AMX/test-muli.mlir | 4 ++-- 8 files changed, 30 insertions(+), 32 deletions(-) diff --git a/mlir/include/mlir/Dialect/AMX/AMX.td b/mlir/include/mlir/Dialect/AMX/AMX.td index 45c63a99e670..24052ed4f24d 100644 --- a/mlir/include/mlir/Dialect/AMX/AMX.td +++ b/mlir/include/mlir/Dialect/AMX/AMX.td @@ -196,14 +196,14 @@ def TileMulIOp : AMX_Op<"tile_muli", [NoSideEffect, AllTypesMatch<["acc", "res"] into a "m x n" destination tile. Supports all "si32 <- s/ui8 x s/ui8" combinations (4 bytes packed into dwords in the columns of both the source operand tiles; the zero or sign extension is specified with - the attributes). The operation is eventually lowered into one of - the "tdpbssd", "tdpbsud", "tdpbusd", or "tdpbuud" instructions with - the corresponding tile configuration. + the attributes, defaulting to sign extension). The operation is eventually + lowered into one of the "tdpbssd", "tdpbsud", "tdpbusd", or "tdpbuud" + instructions with the corresponding tile configuration. Example: ```mlir - %0 = amx.tile_muli %a, %b, %c [true, true] + %0 = amx.tile_muli %a zext, %b zext, %c : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> ``` }]; let arguments = (ins VectorOfRankAndType<[2], [I32, I8]>:$lhs, VectorOfRankAndType<[2], [I32, I8]>:$rhs, VectorOfRankAndType<[2], [I32, I8]>:$acc, - BoolArrayAttr:$zext); + UnitAttr:$isZextLhs, + UnitAttr:$isZextRhs + ); let results = (outs VectorOfRankAndType<[2], [I32, I8]>:$res); let extraClassDeclaration = [{ VectorType getLhsVectorType() { @@ -224,7 +226,7 @@ def TileMulIOp : AMX_Op<"tile_muli", [NoSideEffect, AllTypesMatch<["acc", "res"] return res().getType().cast<VectorType>(); } }]; - let assemblyFormat = "$lhs `,` $rhs `,` $acc $zext attr-dict `:` " + let assemblyFormat = "$lhs (`zext` $isZextLhs^)? `,` $rhs (`zext` $isZextRhs^)? `,` $acc attr-dict `:` " "type($lhs) `,` type($rhs) `,` type($acc) "; } diff --git a/mlir/lib/Dialect/AMX/IR/AMXDialect.cpp b/mlir/lib/Dialect/AMX/IR/AMXDialect.cpp index 5ebef7efe213..ab98820b2ecb 100644 --- a/mlir/lib/Dialect/AMX/IR/AMXDialect.cpp +++ b/mlir/lib/Dialect/AMX/IR/AMXDialect.cpp @@ -85,8 +85,6 @@ static LogicalResult verify(amx::TileMulFOp op) { } static LogicalResult verify(amx::TileMulIOp op) { - if (op.zext().size() != 2) - return op.emitOpError("unexpected zext length"); VectorType aType = op.getLhsVectorType(); VectorType bType = op.getRhsVectorType(); VectorType cType = op.getVectorType(); diff --git a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp index 6e082ce790fc..7db57d383ba3 100644 --- a/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp +++ b/mlir/lib/Dialect/AMX/Transforms/LegalizeForLLVMExport.cpp @@ -191,8 +191,8 @@ struct TileMulIConversion : public ConvertOpToLLVMPattern<TileMulIOp> { getTileSizes(rewriter, *getTypeConverter(), bType, op.getLoc()); // Replace operation with intrinsic. 
Type resType = typeConverter->convertType(cType); - bool zexta = op.zext()[0].cast().getValue(); - bool zextb = op.zext()[1].cast().getValue(); + bool zexta = op.isZextLhs(); + bool zextb = op.isZextRhs(); if (zexta && zextb) rewriter.replaceOpWithNewOp( op, resType, tsza.first, tszb.second, tsza.second, adaptor.acc(), diff --git a/mlir/test/Dialect/AMX/invalid.mlir b/mlir/test/Dialect/AMX/invalid.mlir index b3a7286b526a..6f147cf2851e 100644 --- a/mlir/test/Dialect/AMX/invalid.mlir +++ b/mlir/test/Dialect/AMX/invalid.mlir @@ -46,13 +46,3 @@ func @multsize() { // expected-error@+1 {{'amx.tile_mulf' op bad mult shape: 4 x 4 x 4}} %3 = amx.tile_mulf %0, %1, %2 : vector<8x8xbf16>, vector<8x8xbf16>, vector<4x4xf32> } - -// ----- - -func @zextsize() { - %0 = amx.tile_zero : vector<8x8xi8> - %1 = amx.tile_zero : vector<8x8xi8> - %2 = amx.tile_zero : vector<8x8xi32> - // expected-error@+1 {{'amx.tile_muli' op unexpected zext length}} - %3 = amx.tile_muli %0, %1, %2 [true] : vector<8x8xi8>, vector<8x8xi8>, vector<8x8xi32> -} diff --git a/mlir/test/Dialect/AMX/legalize-for-llvm.mlir b/mlir/test/Dialect/AMX/legalize-for-llvm.mlir index f88d83d8f311..37382b34972d 100644 --- a/mlir/test/Dialect/AMX/legalize-for-llvm.mlir +++ b/mlir/test/Dialect/AMX/legalize-for-llvm.mlir @@ -17,13 +17,13 @@ func @muli(%arg0: memref, %arg1: memref) { %1 = amx.tile_zero : vector<16x64xi8> %2 = amx.tile_load %arg0[%0, %0] : memref into vector<16x64xi8> %3 = amx.tile_load %arg1[%0, %0] : memref into vector<16x16xi32> - %4 = amx.tile_muli %1, %2, %3 [true, true] : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> + %4 = amx.tile_muli %1 zext, %2 zext, %3 : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> amx.tile_store %arg1[%0, %0], %4 : memref, vector<16x16xi32> - %5 = amx.tile_muli %1, %2, %3 [false, false] : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> + %5 = amx.tile_muli %1, %2, %3 : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> amx.tile_store %arg1[%0, %0], %5 : memref, vector<16x16xi32> - %6 = amx.tile_muli %1, %2, %3 [true, false] : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> + %6 = amx.tile_muli %1 zext, %2, %3 : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> amx.tile_store %arg1[%0, %0], %6 : memref, vector<16x16xi32> - %7 = amx.tile_muli %1, %2, %3 [false, true] : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> + %7 = amx.tile_muli %1, %2 zext, %3 : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> amx.tile_store %arg1[%0, %0], %7 : memref, vector<16x16xi32> return } diff --git a/mlir/test/Dialect/AMX/roundtrip.mlir b/mlir/test/Dialect/AMX/roundtrip.mlir index 98b8024c194d..93f3ea4a2977 100644 --- a/mlir/test/Dialect/AMX/roundtrip.mlir +++ b/mlir/test/Dialect/AMX/roundtrip.mlir @@ -28,14 +28,22 @@ func @tmulf(%arg0: memref, %arg1: memref) { // CHECK: %[[x:.*]] = amx.tile_load %{{.*}}[%{{.*}}, %{{.*}}] : memref into vector<16x64xi8> // CHECK: %[[y:.*]] = amx.tile_load %{{.*}}[%{{.*}}, %{{.*}}] : memref into vector<16x64xi8> // CHECK: %[[z:.*]] = amx.tile_load %{{.*}}[%{{.*}}, %{{.*}}] : memref into vector<16x16xi32> -// CHECK: %[[m:.*]] = amx.tile_muli %[[x]], %[[y]], %[[z]] [true, true] : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> +// CHECK: %[[m:.*]] = amx.tile_muli %[[x]] zext, %[[y]] zext, %[[z]] : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> // CHECK: amx.tile_store %{{.*}}[%{{.*}}, %{{.*}}], %[[m]] : memref, vector<16x16xi32> +// Verify the parsing/printing of the sign-extension annotation. 
+// CHECK: amx.tile_muli %{{.*}}, %{{.*}} zext, %{{.*}} +// CHECK: amx.tile_muli %{{.*}} zext, %{{.*}}, %{{.*}} +// CHECK: amx.tile_muli %{{.*}}, %{{.*}}, %{{.*}} func @tmuli(%arg0: memref, %arg1: memref, %arg2: memref) { %0 = constant 0 : index %1 = amx.tile_load %arg0[%0, %0] : memref into vector<16x64xi8> %2 = amx.tile_load %arg1[%0, %0] : memref into vector<16x64xi8> %3 = amx.tile_load %arg2[%0, %0] : memref into vector<16x16xi32> - %4 = amx.tile_muli %1, %2, %3 [true, true] : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> + %4 = amx.tile_muli %1 zext, %2 zext, %3 : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> amx.tile_store %arg2[%0, %0], %4 : memref, vector<16x16xi32> + // Verify the various `zext` combinations. + %5 = amx.tile_muli %1, %2 zext, %3 : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> + %6 = amx.tile_muli %1 zext, %2, %3 : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> + %7 = amx.tile_muli %1, %2, %3 : vector<16x64xi8>, vector<16x64xi8>, vector<16x16xi32> return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-ext.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-ext.mlir index dee283c68212..45e9816fa9d6 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-ext.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli-ext.mlir @@ -24,7 +24,7 @@ func @kernel1(%arg0: memref<16x16xi8>, %1 = amx.tile_load %arg0[%0, %0] : memref<16x16xi8> into vector<16x16xi8> %2 = amx.tile_load %arg1[%0, %0] : memref<4x16xi8> into vector<4x16xi8> %3 = amx.tile_zero : vector<16x4xi32> - %4 = amx.tile_muli %1, %2, %3 [false, false] : vector<16x16xi8>, vector<4x16xi8>, vector<16x4xi32> + %4 = amx.tile_muli %1, %2, %3 : vector<16x16xi8>, vector<4x16xi8>, vector<16x4xi32> amx.tile_store %arg2[%0, %0], %4 : memref<16x4xi32>, vector<16x4xi32> return } @@ -36,7 +36,7 @@ func @kernel2(%arg0: memref<16x16xi8>, %1 = amx.tile_load %arg0[%0, %0] : memref<16x16xi8> into vector<16x16xi8> %2 = amx.tile_load %arg1[%0, %0] : memref<4x16xi8> into vector<4x16xi8> %3 = amx.tile_zero : vector<16x4xi32> - %4 = amx.tile_muli %1, %2, %3 [false, true] : vector<16x16xi8>, vector<4x16xi8>, vector<16x4xi32> + %4 = amx.tile_muli %1, %2 zext, %3 : vector<16x16xi8>, vector<4x16xi8>, vector<16x4xi32> amx.tile_store %arg2[%0, %0], %4 : memref<16x4xi32>, vector<16x4xi32> return } @@ -48,7 +48,7 @@ func @kernel3(%arg0: memref<16x16xi8>, %1 = amx.tile_load %arg0[%0, %0] : memref<16x16xi8> into vector<16x16xi8> %2 = amx.tile_load %arg1[%0, %0] : memref<4x16xi8> into vector<4x16xi8> %3 = amx.tile_zero : vector<16x4xi32> - %4 = amx.tile_muli %1, %2, %3 [true, false] : vector<16x16xi8>, vector<4x16xi8>, vector<16x4xi32> + %4 = amx.tile_muli %1 zext, %2, %3 : vector<16x16xi8>, vector<4x16xi8>, vector<16x4xi32> amx.tile_store %arg2[%0, %0], %4 : memref<16x4xi32>, vector<16x4xi32> return } @@ -60,7 +60,7 @@ func @kernel4(%arg0: memref<16x16xi8>, %1 = amx.tile_load %arg0[%0, %0] : memref<16x16xi8> into vector<16x16xi8> %2 = amx.tile_load %arg1[%0, %0] : memref<4x16xi8> into vector<4x16xi8> %3 = amx.tile_zero : vector<16x4xi32> - %4 = amx.tile_muli %1, %2, %3 [true, true] : vector<16x16xi8>, vector<4x16xi8>, vector<16x4xi32> + %4 = amx.tile_muli %1 zext, %2 zext, %3 : vector<16x16xi8>, vector<4x16xi8>, vector<16x4xi32> amx.tile_store %arg2[%0, %0], %4 : memref<16x4xi32>, vector<16x4xi32> return } diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli.mlir index 
a52f66c640f8..df848a04eae7 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/test-muli.mlir @@ -13,7 +13,7 @@ func @kernel1(%arg0: memref<2x8xi8>, %1 = amx.tile_load %arg0[%0, %0] : memref<2x8xi8> into vector<2x8xi8> %2 = amx.tile_load %arg1[%0, %0] : memref<2x8xi8> into vector<2x8xi8> %3 = amx.tile_zero : vector<2x2xi32> - %4 = amx.tile_muli %1, %2, %3 [true, true] : vector<2x8xi8>, vector<2x8xi8>, vector<2x2xi32> + %4 = amx.tile_muli %1 zext, %2 zext, %3 : vector<2x8xi8>, vector<2x8xi8>, vector<2x2xi32> amx.tile_store %arg2[%0, %0], %4 : memref<2x2xi32>, vector<2x2xi32> return } @@ -26,7 +26,7 @@ func @kernel2(%arg0: memref<2x8xi8>, %1 = amx.tile_load %arg0[%0, %0] : memref<2x8xi8> into vector<2x8xi8> %2 = amx.tile_load %arg1[%0, %0] : memref<2x8xi8> into vector<2x8xi8> %3 = amx.tile_load %arg2[%0, %0] : memref<2x2xi32> into vector<2x2xi32> - %4 = amx.tile_muli %1, %2, %3 [true, true] : vector<2x8xi8>, vector<2x8xi8>, vector<2x2xi32> + %4 = amx.tile_muli %1 zext, %2 zext, %3 : vector<2x8xi8>, vector<2x8xi8>, vector<2x2xi32> amx.tile_store %arg2[%0, %0], %4 : memref<2x2xi32>, vector<2x2xi32> return } -- GitLab From 3d155157bf621effd51e3f62050d488572a11501 Mon Sep 17 00:00:00 2001 From: Siva Chandra Date: Sat, 20 Mar 2021 04:12:33 +0000 Subject: [PATCH 0237/1000] [libc] Use add_library in add_entrypoint_library instead of invoking ar. --- libc/cmake/modules/LLVMLibCLibraryRules.cmake | 118 +++++++----------- 1 file changed, 46 insertions(+), 72 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCLibraryRules.cmake b/libc/cmake/modules/LLVMLibCLibraryRules.cmake index 21a99a0dd0a9..bdc361a719de 100644 --- a/libc/cmake/modules/LLVMLibCLibraryRules.cmake +++ b/libc/cmake/modules/LLVMLibCLibraryRules.cmake @@ -1,70 +1,42 @@ -# This is a helper function and not a build rule. It is to be used by the -# the "add_entrypoint_library" rule to generate the full list of object files -# recursively produced by "add_object_library" targets upstream in the -# dependency tree. This function traverses up through the -# "add_entrypoint_object" targets but does not collect the object files -# produced by them. -# Usage: -# get_object_files_for_test( [ ...]) -# -# targetN is either an "add_entrypoint_target" target or an -# "add_object_library" target. -function(get_object_files_for_entrypoint_library result) - set(object_files "") - foreach(dep IN LISTS ARGN) - get_target_property(dep_type ${dep} "TARGET_TYPE") - if (NOT dep_type) - continue() - endif() - - if(${dep_type} STREQUAL ${OBJECT_LIBRARY_TARGET_TYPE}) - get_target_property(dep_object_files ${dep} "OBJECT_FILES") - if(dep_object_files) - list(APPEND object_files ${dep_object_files}) - endif() - endif() - - get_target_property(indirect_deps ${dep} "DEPS") - get_object_files_for_entrypoint_library(indirect_objfiles ${indirect_deps}) - list(APPEND object_files ${indirect_objfiles}) - endforeach(dep) - list(REMOVE_DUPLICATES object_files) - set(${result} ${object_files} PARENT_SCOPE) -endfunction() - -# This is a helper function and not a build rule. Given an entrypoint object -# target, it returns the object file produced by this target in |result|. -# If the given entrypoint target is an alias, then it traverses up to the -# aliasee to get the object file. 
-function(get_entrypoint_object_file entrypoint_target result) - get_target_property(target_type ${entrypoint_target} "TARGET_TYPE") - if(NOT (${target_type} STREQUAL ${ENTRYPOINT_OBJ_TARGET_TYPE})) - message(FATAL_ERROR - "Expected an target added using `add_entrypoint_object` rule.") +function(collect_object_file_deps target result) + set(all_deps "") + get_target_property(target_type ${target} "TARGET_TYPE") + if(NOT target_type) + return() endif() - get_target_property(objfile ${entrypoint_target} "OBJECT_FILE") - if(objfile) - set(${result} ${objfile} PARENT_SCOPE) + if(${target_type} STREQUAL ${OBJECT_LIBRARY_TARGET_TYPE}) + list(APPEND all_deps ${target}) + get_target_property(deps ${target} "DEPS") + foreach(dep IN LISTS deps) + collect_object_file_deps(${dep} dep_targets) + list(APPEND all_deps ${dep_targets}) + endforeach(dep) + set(${result} ${all_deps} PARENT_SCOPE) return() endif() - # If the entrypoint is an alias, fetch the object file from the aliasee. - get_target_property(is_alias ${entrypoint_target} "IS_ALIAS") - if(is_alias) - get_target_property(aliasee ${entrypoint_target} "DEPS") - if(NOT aliasee) - message(FATAL_ERROR - "Entrypoint alias ${entrypoint_target} does not have an aliasee.") + if(${target_type} STREQUAL ${ENTRYPOINT_OBJ_TARGET_TYPE}) + set(entrypoint_target ${target}) + get_target_property(is_alias ${entrypoint_target} "IS_ALIAS") + if(is_alias) + get_target_property(aliasee ${entrypoint_target} "DEPS") + if(NOT aliasee) + message(FATAL_ERROR + "Entrypoint alias ${entrypoint_target} does not have an aliasee.") + endif() + set(entrypoint_target ${aliasee}) endif() - get_entrypoint_object_file(${aliasee} objfile) - set(${result} ${objfile} PARENT_SCOPE) + list(APPEND all_deps ${entrypoint_target}) + get_target_property(deps ${target} "DEPS") + foreach(dep IN LISTS deps) + collect_object_file_deps(${dep} dep_targets) + list(APPEND all_deps ${dep_targets}) + endforeach(dep) + set(${result} ${all_deps} PARENT_SCOPE) return() endif() - - message(FATAL_ERROR - "Entrypoint ${entrypoint_target} does not produce an object file.") -endfunction(get_entrypoint_object_file) +endfunction(collect_object_file_deps) # A rule to build a library from a collection of entrypoint objects. 
# Usage: @@ -89,28 +61,30 @@ function(add_entrypoint_library target_name) endif() get_fq_deps_list(fq_deps_list ${ENTRYPOINT_LIBRARY_DEPENDS}) - get_object_files_for_entrypoint_library(obj_list ${fq_deps_list}) + set(all_deps "") foreach(dep IN LISTS fq_deps_list) get_target_property(dep_type ${dep} "TARGET_TYPE") if(NOT (${dep_type} STREQUAL ${ENTRYPOINT_OBJ_TARGET_TYPE})) message(FATAL_ERROR "Dependency '${dep}' of 'add_entrypoint_collection' is " "not an 'add_entrypoint_object' target.") endif() - get_entrypoint_object_file(${dep} objfile) - list(APPEND obj_list ${objfile}) + collect_object_file_deps(${dep} recursive_deps) + list(APPEND all_deps ${recursive_deps}) endforeach(dep) - list(REMOVE_DUPLICATES obj_list) - - set(library_file "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${target_name}${CMAKE_STATIC_LIBRARY_SUFFIX}") - add_custom_command( - OUTPUT ${library_file} - COMMAND ${CMAKE_AR} -r ${library_file} ${obj_list} - DEPENDS ${obj_list} + list(REMOVE_DUPLICATES all_deps) + set(objects "") + foreach(dep IN LISTS all_deps) + list(APPEND objects $) + endforeach(dep) + add_library( + ${target_name} + STATIC + ${objects} ) - add_custom_target( + set_target_properties( ${target_name} - ALL - DEPENDS ${library_file} + PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endfunction(add_entrypoint_library) -- GitLab From 6c9cac5da1d1c64e2df4e9c7ea21355a34595a8a Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Sat, 20 Mar 2021 13:38:26 +0900 Subject: [PATCH 0238/1000] [AMDGPU] Add MDT update missing from D98915 --- llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp index 42cc09f8e484..3fb96f15313d 100644 --- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp @@ -207,6 +207,7 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) { } MBB->addSuccessor(EmptyMBBAtEnd); + MDT->getBase().insertEdge(MBB, EmptyMBBAtEnd); BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_BRANCH)) .addMBB(EmptyMBBAtEnd); MI->eraseFromParent(); -- GitLab From e990fa2170314b179ec025b68fd00fbe9aab398d Mon Sep 17 00:00:00 2001 From: Rob Suderman Date: Fri, 19 Mar 2021 17:47:39 -0700 Subject: [PATCH 0239/1000] [mlir][tosa] Add tosa.reverse lowering to linalg.generic Reverse lowers to a linalg.generic op by reversing the read order in the index map. 
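For example, reversing along axis 0 of a statically shaped input (a sketch mirroring the test added below):

```mlir
func @reverse_rows(%arg0 : tensor<5x4xi32>) -> tensor<5x4xi32> {
  // Axis 0 has extent 5, so output row d0 reads input row (4 - d0); the
  // lowering builds the input indexing map (d0, d1) -> (-d0 + 4, d1).
  %0 = "tosa.reverse"(%arg0) {axis = 0 : i64} : (tensor<5x4xi32>) -> tensor<5x4xi32>
  return %0 : tensor<5x4xi32>
}
```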
Differential Revision: https://reviews.llvm.org/D98997 --- .../Conversion/TosaToLinalg/TosaToLinalg.cpp | 60 +++++++++++++++++-- .../TosaToLinalg/tosa-to-linalg.mlir | 23 +++++++ 2 files changed, 78 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp index 72b9aa850213..fc831162b104 100644 --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -585,7 +585,7 @@ public: } }; -class ReshapeOpConverter : public OpConversionPattern<tosa::ReshapeOp> { +class ReshapeConverter : public OpConversionPattern<tosa::ReshapeOp> { public: using OpConversionPattern<tosa::ReshapeOp>::OpConversionPattern; @@ -727,7 +727,7 @@ public: } }; -class RescaleOpConverter : public OpRewritePattern<tosa::RescaleOp> { +class RescaleConverter : public OpRewritePattern<tosa::RescaleOp> { public: using OpRewritePattern<tosa::RescaleOp>::OpRewritePattern; @@ -889,7 +889,7 @@ public: } }; -struct ConcatOpConversion : public OpConversionPattern<tosa::ConcatOp> { +struct ConcatConverter : public OpConversionPattern<tosa::ConcatOp> { using OpConversionPattern<tosa::ConcatOp>::OpConversionPattern; LogicalResult @@ -936,6 +936,56 @@ struct ConcatOpConversion : public OpConversionPattern<tosa::ConcatOp> { } }; +class ReverseConverter : public OpRewritePattern<tosa::ReverseOp> { +public: + using OpRewritePattern<tosa::ReverseOp>::OpRewritePattern; + + LogicalResult matchAndRewrite(tosa::ReverseOp op, + PatternRewriter &rewriter) const final { + auto loc = op.getLoc(); + Value input = op.input(); + auto inputTy = input.getType().template cast<ShapedType>(); + auto resultTy = op.getType().template cast<ShapedType>(); + auto rank = resultTy.getRank(); + auto axis = op.axis(); + + if (!inputTy.hasStaticShape()) + return rewriter.notifyMatchFailure( + op, "tosa.reverse requires a statically shaped input"); + + // Create an init tensor to hold the reversed result. + auto initTensor = rewriter + .create<linalg::InitTensorOp>( + loc, ArrayRef<Value>({}), inputTy.getShape(), + inputTy.getElementType()) + .result(); + + SmallVector<AffineExpr, 2> inputExprs; + inputExprs.resize(resultTy.getRank()); + + for (int i = 0; i < rank; i++) + inputExprs[i] = rewriter.getAffineDimExpr(i); + + inputExprs[axis] = + rewriter.getAffineConstantExpr(inputTy.getDimSize(axis) - 1) - + inputExprs[axis]; + + SmallVector<AffineMap, 2> affineMaps = { + AffineMap::get(resultTy.getRank(), /*symbolCount=*/0, inputExprs, + rewriter.getContext()), + rewriter.getMultiDimIdentityMap(resultTy.getRank())}; + + rewriter.replaceOpWithNewOp<linalg::GenericOp>( + op, resultTy, op.input(), ValueRange{initTensor}, affineMaps, + getNParallelLoopsAttrs(resultTy.getRank()), + [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { + nestedBuilder.create<linalg::YieldOp>(op.getLoc(), *args.begin()); + }); + + return success(); + } +}; + } // namespace void mlir::tosa::populateTosaToLinalgOnTensorsConversionPatterns( @@ -963,6 +1013,6 @@ void mlir::tosa::populateTosaToLinalgOnTensorsConversionPatterns( IdentityNConverter, IdentityNConverter, ReduceConverter, ReduceConverter, ReduceConverter, - ReduceConverter, ConcatOpConversion, - ReshapeOpConverter, TransposeConverter, RescaleOpConverter>(context); + ReduceConverter, ConcatConverter, ReshapeConverter, + RescaleConverter, ReverseConverter, TransposeConverter>(context); } diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir index f25eb3f346ba..c41770b105ba 100644 --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -598,3 +598,26 @@ func @rescaleUnnecessaryDoubleRound(%arg0 : tensor<1xi8>) -> (tensor<1xi8>) { %0 = "tosa.rescale"(%arg0) 
{input_zp = 243 : i32, output_zp = 252 : i32, multiplier = [19689 : i32], shift = [15 : i32], scale32 = true, double_round = true, per_channel = false} : (tensor<1xi8>) -> (tensor<1xi8>) return %0 : tensor<1xi8> } + +// ----- + +// CHECK: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (-d0 + 4, d1)> +// CHECK: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 3)> + +// CHECK-LABEL: @reverse +func @reverse(%arg0: tensor<5x4xi32>) -> () { + // CHECK: [[INIT:%.+]] = linalg.init_tensor [5, 4] + // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<5x4xi32>) outs([[INIT]] : tensor<5x4xi32>) { + // CHECK: ^bb0(%arg1: i32, %arg2: i32): + // CHECK: linalg.yield %arg1 : i32 + %0 = "tosa.reverse"(%arg0) {axis = 0 : i64} : (tensor<5x4xi32>) -> tensor<5x4xi32> + + // CHECK: [[INIT:%.+]] = linalg.init_tensor [5, 4] + // CHECK: [[GENERIC:%.+]] = linalg.generic {indexing_maps = [#[[$MAP2]], #[[$MAP1]]], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<5x4xi32>) outs([[INIT]] : tensor<5x4xi32>) { + // CHECK: ^bb0(%arg1: i32, %arg2: i32): + // CHECK: linalg.yield %arg1 : i32 + %1 = "tosa.reverse"(%arg0) {axis = 1 : i64} : (tensor<5x4xi32>) -> tensor<5x4xi32> + + return +} -- GitLab From 4d11baab25a840220267c0c9eceee6411a609a14 Mon Sep 17 00:00:00 2001 From: Shao-Ce Sun Date: Sat, 20 Mar 2021 13:43:07 +0800 Subject: [PATCH 0240/1000] [NFC][ValueTypes] Align code by column Adjusted some whitespaces. Reviewed By: craig.topper Differential Revision: https://reviews.llvm.org/D98975 --- llvm/include/llvm/CodeGen/ValueTypes.td | 352 +++++++++---------- llvm/include/llvm/Support/MachineValueType.h | 56 +-- 2 files changed, 204 insertions(+), 204 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td index 1ac9e47cab69..775ce448226a 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.td +++ b/llvm/include/llvm/CodeGen/ValueTypes.td @@ -18,121 +18,121 @@ class ValueType { int Value = value; } -def OtherVT: ValueType<0 , 1>; // "Other" value -def i1 : ValueType<1 , 2>; // One bit boolean value -def i8 : ValueType<8 , 3>; // 8-bit integer value -def i16 : ValueType<16 , 4>; // 16-bit integer value -def i32 : ValueType<32 , 5>; // 32-bit integer value -def i64 : ValueType<64 , 6>; // 64-bit integer value -def i128 : ValueType<128, 7>; // 128-bit integer value - -def bf16 : ValueType<16 , 8>; // 16-bit brain floating point value -def f16 : ValueType<16 , 9>; // 16-bit floating point value -def f32 : ValueType<32 , 10>; // 32-bit floating point value -def f64 : ValueType<64 , 11>; // 64-bit floating point value -def f80 : ValueType<80 , 12>; // 80-bit floating point value -def f128 : ValueType<128, 13>; // 128-bit floating point value -def ppcf128: ValueType<128, 14>; // PPC 128-bit floating point value - -def v1i1 : ValueType<1 , 15>; // 1 x i1 vector value -def v2i1 : ValueType<2 , 16>; // 2 x i1 vector value -def v4i1 : ValueType<4 , 17>; // 4 x i1 vector value -def v8i1 : ValueType<8 , 18>; // 8 x i1 vector value -def v16i1 : ValueType<16, 19>; // 16 x i1 vector value -def v32i1 : ValueType<32 , 20>; // 32 x i1 vector value -def v64i1 : ValueType<64 , 21>; // 64 x i1 vector value -def v128i1 : ValueType<128, 22>; // 128 x i1 vector value -def v256i1 : ValueType<256, 23>; // 256 x i1 vector value -def v512i1 : ValueType<512, 24>; // 512 x i1 vector value -def v1024i1: ValueType<1024,25>; //1024 x i1 vector value 
- -def v1i8 : ValueType<8, 26>; // 1 x i8 vector value -def v2i8 : ValueType<16 , 27>; // 2 x i8 vector value -def v4i8 : ValueType<32 , 28>; // 4 x i8 vector value -def v8i8 : ValueType<64 , 29>; // 8 x i8 vector value -def v16i8 : ValueType<128, 30>; // 16 x i8 vector value -def v32i8 : ValueType<256, 31>; // 32 x i8 vector value -def v64i8 : ValueType<512, 32>; // 64 x i8 vector value -def v128i8 : ValueType<1024,33>; //128 x i8 vector value -def v256i8 : ValueType<2048,34>; //256 x i8 vector value - -def v1i16 : ValueType<16 , 35>; // 1 x i16 vector value -def v2i16 : ValueType<32 , 36>; // 2 x i16 vector value -def v3i16 : ValueType<48 , 37>; // 3 x i16 vector value -def v4i16 : ValueType<64 , 38>; // 4 x i16 vector value -def v8i16 : ValueType<128, 39>; // 8 x i16 vector value -def v16i16 : ValueType<256, 40>; // 16 x i16 vector value -def v32i16 : ValueType<512, 41>; // 32 x i16 vector value -def v64i16 : ValueType<1024,42>; // 64 x i16 vector value -def v128i16: ValueType<2048,43>; //128 x i16 vector value - -def v1i32 : ValueType<32 , 44>; // 1 x i32 vector value -def v2i32 : ValueType<64 , 45>; // 2 x i32 vector value -def v3i32 : ValueType<96 , 46>; // 3 x i32 vector value -def v4i32 : ValueType<128, 47>; // 4 x i32 vector value -def v5i32 : ValueType<160, 48>; // 5 x i32 vector value -def v8i32 : ValueType<256, 49>; // 8 x i32 vector value -def v16i32 : ValueType<512, 50>; // 16 x i32 vector value -def v32i32 : ValueType<1024,51>; // 32 x i32 vector value -def v64i32 : ValueType<2048,52>; // 64 x i32 vector value -def v128i32 : ValueType<4096,53>; // 128 x i32 vector value -def v256i32 : ValueType<8182,54>; // 256 x i32 vector value -def v512i32 : ValueType<16384,55>; // 512 x i32 vector value -def v1024i32 : ValueType<32768,56>; // 1024 x i32 vector value -def v2048i32 : ValueType<65536,57>; // 2048 x i32 vector value - -def v1i64 : ValueType<64 , 58>; // 1 x i64 vector value -def v2i64 : ValueType<128, 59>; // 2 x i64 vector value -def v4i64 : ValueType<256, 60>; // 4 x i64 vector value -def v8i64 : ValueType<512, 61>; // 8 x i64 vector value -def v16i64 : ValueType<1024,62>; // 16 x i64 vector value -def v32i64 : ValueType<2048,63>; // 32 x i64 vector value -def v64i64 : ValueType<4096,64>; // 64 x i64 vector value -def v128i64: ValueType<8192,65>; // 128 x i64 vector value -def v256i64: ValueType<16384,66>; // 256 x i64 vector value - -def v1i128 : ValueType<128, 67>; // 1 x i128 vector value - -def v1f16 : ValueType<16 , 68>; // 1 x f16 vector value -def v2f16 : ValueType<32 , 69>; // 2 x f16 vector value -def v3f16 : ValueType<48 , 70>; // 3 x f16 vector value -def v4f16 : ValueType<64 , 71>; // 4 x f16 vector value -def v8f16 : ValueType<128, 72>; // 8 x f16 vector value -def v16f16 : ValueType<256, 73>; // 16 x f16 vector value -def v32f16 : ValueType<512, 74>; // 32 x f16 vector value -def v64f16 : ValueType<1024, 75>; // 64 x f16 vector value -def v128f16 : ValueType<2048, 76>; // 128 x f16 vector value -def v2bf16 : ValueType<32 , 77>; // 2 x bf16 vector value -def v3bf16 : ValueType<48 , 78>; // 3 x bf16 vector value -def v4bf16 : ValueType<64 , 79>; // 4 x bf16 vector value -def v8bf16 : ValueType<128, 80>; // 8 x bf16 vector value -def v16bf16 : ValueType<256, 81>; // 16 x bf16 vector value -def v32bf16 : ValueType<512, 82>; // 32 x bf16 vector value -def v64bf16 : ValueType<1024, 83>; // 64 x bf16 vector value -def v128bf16 : ValueType<2048, 84>; // 128 x bf16 vector value -def v1f32 : ValueType<32 , 85>; // 1 x f32 vector value -def v2f32 : ValueType<64 , 86>; 
// 2 x f32 vector value
-def v3f32 : ValueType<96 , 87>; // 3 x f32 vector value
-def v4f32 : ValueType<128, 88>; // 4 x f32 vector value
-def v5f32 : ValueType<160, 89>; // 5 x f32 vector value
-def v8f32 : ValueType<256, 90>; // 8 x f32 vector value
-def v16f32 : ValueType<512, 91>; // 16 x f32 vector value
-def v32f32 : ValueType<1024, 92>; // 32 x f32 vector value
-def v64f32 : ValueType<2048, 93>; // 64 x f32 vector value
-def v128f32 : ValueType<4096, 94>; // 128 x f32 vector value
-def v256f32 : ValueType<8182, 95>; // 256 x f32 vector value
-def v512f32 : ValueType<16384, 96>; // 512 x f32 vector value
-def v1024f32 : ValueType<32768, 97>; // 1024 x f32 vector value
-def v2048f32 : ValueType<65536, 98>; // 2048 x f32 vector value
-def v1f64 : ValueType<64, 99>; // 1 x f64 vector value
-def v2f64 : ValueType<128, 100>; // 2 x f64 vector value
-def v4f64 : ValueType<256, 101>; // 4 x f64 vector value
-def v8f64 : ValueType<512, 102>; // 8 x f64 vector value
-def v16f64 : ValueType<1024, 103>; // 16 x f64 vector value
-def v32f64 : ValueType<2048, 104>; // 32 x f64 vector value
-def v64f64 : ValueType<4096, 105>; // 64 x f64 vector value
-def v128f64 : ValueType<8192, 106>; // 128 x f64 vector value
-def v256f64 : ValueType<16384, 107>; // 256 x f64 vector value
+def OtherVT : ValueType<0, 1>; // "Other" value
+def i1 : ValueType<1, 2>; // One bit boolean value
+def i8 : ValueType<8, 3>; // 8-bit integer value
+def i16 : ValueType<16, 4>; // 16-bit integer value
+def i32 : ValueType<32, 5>; // 32-bit integer value
+def i64 : ValueType<64, 6>; // 64-bit integer value
+def i128 : ValueType<128, 7>; // 128-bit integer value
+
+def bf16 : ValueType<16, 8>; // 16-bit brain floating point value
+def f16 : ValueType<16, 9>; // 16-bit floating point value
+def f32 : ValueType<32, 10>; // 32-bit floating point value
+def f64 : ValueType<64, 11>; // 64-bit floating point value
+def f80 : ValueType<80, 12>; // 80-bit floating point value
+def f128 : ValueType<128, 13>; // 128-bit floating point value
+def ppcf128 : ValueType<128, 14>; // PPC 128-bit floating point value
+
+def v1i1 : ValueType<1, 15>; // 1 x i1 vector value
+def v2i1 : ValueType<2, 16>; // 2 x i1 vector value
+def v4i1 : ValueType<4, 17>; // 4 x i1 vector value
+def v8i1 : ValueType<8, 18>; // 8 x i1 vector value
+def v16i1 : ValueType<16, 19>; // 16 x i1 vector value
+def v32i1 : ValueType<32, 20>; // 32 x i1 vector value
+def v64i1 : ValueType<64, 21>; // 64 x i1 vector value
+def v128i1 : ValueType<128, 22>; // 128 x i1 vector value
+def v256i1 : ValueType<256, 23>; // 256 x i1 vector value
+def v512i1 : ValueType<512, 24>; // 512 x i1 vector value
+def v1024i1 : ValueType<1024, 25>; // 1024 x i1 vector value
+
+def v1i8 : ValueType<8, 26>; // 1 x i8 vector value
+def v2i8 : ValueType<16, 27>; // 2 x i8 vector value
+def v4i8 : ValueType<32, 28>; // 4 x i8 vector value
+def v8i8 : ValueType<64, 29>; // 8 x i8 vector value
+def v16i8 : ValueType<128, 30>; // 16 x i8 vector value
+def v32i8 : ValueType<256, 31>; // 32 x i8 vector value
+def v64i8 : ValueType<512, 32>; // 64 x i8 vector value
+def v128i8 : ValueType<1024, 33>; // 128 x i8 vector value
+def v256i8 : ValueType<2048, 34>; // 256 x i8 vector value
+
+def v1i16 : ValueType<16, 35>; // 1 x i16 vector value
+def v2i16 : ValueType<32, 36>; // 2 x i16 vector value
+def v3i16 : ValueType<48, 37>; // 3 x i16 vector value
+def v4i16 : ValueType<64, 38>; // 4 x i16 vector value
+def v8i16 : ValueType<128, 39>; // 8 x i16 vector value
+def v16i16 : ValueType<256, 40>; // 16 x i16 vector value
+def v32i16 : ValueType<512, 41>; // 32 x i16 vector value
+def v64i16 : ValueType<1024, 42>; // 64 x i16 vector value
+def v128i16 : ValueType<2048, 43>; // 128 x i16 vector value
+
+def v1i32 : ValueType<32, 44>; // 1 x i32 vector value
+def v2i32 : ValueType<64, 45>; // 2 x i32 vector value
+def v3i32 : ValueType<96, 46>; // 3 x i32 vector value
+def v4i32 : ValueType<128, 47>; // 4 x i32 vector value
+def v5i32 : ValueType<160, 48>; // 5 x i32 vector value
+def v8i32 : ValueType<256, 49>; // 8 x i32 vector value
+def v16i32 : ValueType<512, 50>; // 16 x i32 vector value
+def v32i32 : ValueType<1024, 51>; // 32 x i32 vector value
+def v64i32 : ValueType<2048, 52>; // 64 x i32 vector value
+def v128i32 : ValueType<4096, 53>; // 128 x i32 vector value
+def v256i32 : ValueType<8182, 54>; // 256 x i32 vector value
+def v512i32 : ValueType<16384, 55>; // 512 x i32 vector value
+def v1024i32 : ValueType<32768, 56>; // 1024 x i32 vector value
+def v2048i32 : ValueType<65536, 57>; // 2048 x i32 vector value
+
+def v1i64 : ValueType<64, 58>; // 1 x i64 vector value
+def v2i64 : ValueType<128, 59>; // 2 x i64 vector value
+def v4i64 : ValueType<256, 60>; // 4 x i64 vector value
+def v8i64 : ValueType<512, 61>; // 8 x i64 vector value
+def v16i64 : ValueType<1024, 62>; // 16 x i64 vector value
+def v32i64 : ValueType<2048, 63>; // 32 x i64 vector value
+def v64i64 : ValueType<4096, 64>; // 64 x i64 vector value
+def v128i64 : ValueType<8192, 65>; // 128 x i64 vector value
+def v256i64 : ValueType<16384, 66>; // 256 x i64 vector value
+
+def v1i128 : ValueType<128, 67>; // 1 x i128 vector value
+
+def v1f16 : ValueType<16, 68>; // 1 x f16 vector value
+def v2f16 : ValueType<32, 69>; // 2 x f16 vector value
+def v3f16 : ValueType<48, 70>; // 3 x f16 vector value
+def v4f16 : ValueType<64, 71>; // 4 x f16 vector value
+def v8f16 : ValueType<128, 72>; // 8 x f16 vector value
+def v16f16 : ValueType<256, 73>; // 16 x f16 vector value
+def v32f16 : ValueType<512, 74>; // 32 x f16 vector value
+def v64f16 : ValueType<1024, 75>; // 64 x f16 vector value
+def v128f16 : ValueType<2048, 76>; // 128 x f16 vector value
+def v2bf16 : ValueType<32, 77>; // 2 x bf16 vector value
+def v3bf16 : ValueType<48, 78>; // 3 x bf16 vector value
+def v4bf16 : ValueType<64, 79>; // 4 x bf16 vector value
+def v8bf16 : ValueType<128, 80>; // 8 x bf16 vector value
+def v16bf16 : ValueType<256, 81>; // 16 x bf16 vector value
+def v32bf16 : ValueType<512, 82>; // 32 x bf16 vector value
+def v64bf16 : ValueType<1024, 83>; // 64 x bf16 vector value
+def v128bf16 : ValueType<2048, 84>; // 128 x bf16 vector value
+def v1f32 : ValueType<32, 85>; // 1 x f32 vector value
+def v2f32 : ValueType<64, 86>; // 2 x f32 vector value
+def v3f32 : ValueType<96, 87>; // 3 x f32 vector value
+def v4f32 : ValueType<128, 88>; // 4 x f32 vector value
+def v5f32 : ValueType<160, 89>; // 5 x f32 vector value
+def v8f32 : ValueType<256, 90>; // 8 x f32 vector value
+def v16f32 : ValueType<512, 91>; // 16 x f32 vector value
+def v32f32 : ValueType<1024, 92>; // 32 x f32 vector value
+def v64f32 : ValueType<2048, 93>; // 64 x f32 vector value
+def v128f32 : ValueType<4096, 94>; // 128 x f32 vector value
+def v256f32 : ValueType<8182, 95>; // 256 x f32 vector value
+def v512f32 : ValueType<16384, 96>; // 512 x f32 vector value
+def v1024f32 : ValueType<32768, 97>; // 1024 x f32 vector value
+def v2048f32 : ValueType<65536, 98>; // 2048 x f32 vector value
+def v1f64 : ValueType<64, 99>; // 1 x f64 vector value
+def v2f64 : ValueType<128, 100>; // 2 x f64 vector value
+def v4f64 : ValueType<256, 101>; // 4 x f64 vector value
+def v8f64 : ValueType<512, 102>; // 8 x f64 vector value
+def v16f64 : ValueType<1024, 103>; // 16 x f64 vector value
+def v32f64 : ValueType<2048, 104>; // 32 x f64 vector value
+def v64f64 : ValueType<4096, 105>; // 64 x f64 vector value
+def v128f64 : ValueType<8192, 106>; // 128 x f64 vector value
+def v256f64 : ValueType<16384, 107>; // 256 x f64 vector value
 
 def nxv1i1 : ValueType<1, 108>; // n x 1 x i1 vector value
 def nxv2i1 : ValueType<2, 109>; // n x 2 x i1 vector value
@@ -140,7 +140,7 @@ def nxv4i1 : ValueType<4, 110>; // n x 4 x i1 vector value
 def nxv8i1 : ValueType<8, 111>; // n x 8 x i1 vector value
 def nxv16i1 : ValueType<16, 112>; // n x 16 x i1 vector value
 def nxv32i1 : ValueType<32, 113>; // n x 32 x i1 vector value
-def nxv64i1 : ValueType<64,114>; // n x 64 x i1 vector value
+def nxv64i1 : ValueType<64, 114>; // n x 64 x i1 vector value
 
 def nxv1i8 : ValueType<8, 115>; // n x 1 x i8 vector value
 def nxv2i8 : ValueType<16, 116>; // n x 2 x i8 vector value
@@ -148,79 +148,79 @@ def nxv4i8 : ValueType<32, 117>; // n x 4 x i8 vector value
 def nxv8i8 : ValueType<64, 118>; // n x 8 x i8 vector value
 def nxv16i8 : ValueType<128, 119>; // n x 16 x i8 vector value
 def nxv32i8 : ValueType<256, 120>; // n x 32 x i8 vector value
-def nxv64i8 : ValueType<512, 121>; // n x 64 x i8 vector value
-
-def nxv1i16 : ValueType<16, 122>; // n x 1 x i16 vector value
-def nxv2i16 : ValueType<32, 123>; // n x 2 x i16 vector value
-def nxv4i16 : ValueType<64, 124>; // n x 4 x i16 vector value
-def nxv8i16 : ValueType<128, 125>; // n x 8 x i16 vector value
-def nxv16i16: ValueType<256, 126>; // n x 16 x i16 vector value
-def nxv32i16: ValueType<512, 127>; // n x 32 x i16 vector value
-
-def nxv1i32 : ValueType<32, 128>; // n x 1 x i32 vector value
-def nxv2i32 : ValueType<64, 129>; // n x 2 x i32 vector value
-def nxv4i32 : ValueType<128, 130>; // n x 4 x i32 vector value
-def nxv8i32 : ValueType<256, 131>; // n x 8 x i32 vector value
-def nxv16i32: ValueType<512, 132>; // n x 16 x i32 vector value
-def nxv32i32: ValueType<1024,133>; // n x 32 x i32 vector value
-
-def nxv1i64 : ValueType<64, 134>; // n x 1 x i64 vector value
-def nxv2i64 : ValueType<128, 135>; // n x 2 x i64 vector value
-def nxv4i64 : ValueType<256, 136>; // n x 4 x i64 vector value
-def nxv8i64 : ValueType<512, 137>; // n x 8 x i64 vector value
-def nxv16i64: ValueType<1024,138>; // n x 16 x i64 vector value
-def nxv32i64: ValueType<2048,139>; // n x 32 x i64 vector value
-
-def nxv1f16 : ValueType<16, 140>; // n x 1 x f16 vector value
-def nxv2f16 : ValueType<32, 141>; // n x 2 x f16 vector value
-def nxv4f16 : ValueType<64, 142>; // n x 4 x f16 vector value
-def nxv8f16 : ValueType<128,143>; // n x 8 x f16 vector value
-def nxv16f16 : ValueType<256,144>; // n x 16 x f16 vector value
-def nxv32f16 : ValueType<512,145>; // n x 32 x f16 vector value
-def nxv1bf16 : ValueType<16, 146>; // n x 1 x bf16 vector value
-def nxv2bf16 : ValueType<32, 147>; // n x 2 x bf16 vector value
-def nxv4bf16 : ValueType<64, 148>; // n x 4 x bf16 vector value
-def nxv8bf16 : ValueType<128,149>; // n x 8 x bf16 vector value
-def nxv1f32 : ValueType<32, 150>; // n x 1 x f32 vector value
-def nxv2f32 : ValueType<64, 151>; // n x 2 x f32 vector value
-def nxv4f32 : ValueType<128,152>; // n x 4 x f32 vector value
-def nxv8f32 : ValueType<256,153>; // n x 8 x f32 vector value
-def nxv16f32 : ValueType<512,154>; // n x 16 x f32 vector value
-def nxv1f64 : ValueType<64, 155>; // n x 1 x f64 vector value
-def nxv2f64 : ValueType<128,156>; // n x 2 x f64 vector value
-def nxv4f64 : ValueType<256,157>; // n x 4 x f64 vector value
-def nxv8f64 : ValueType<512,158>; // n x 8 x f64 vector value
-
-def x86mmx : ValueType<64, 159>; // X86 MMX value
-def FlagVT : ValueType<0, 160>; // Pre-RA sched glue
-def isVoid : ValueType<0, 161>; // Produces no value
-def untyped: ValueType<8, 162>; // Produces an untyped value
-def funcref : ValueType<0, 163>; // WebAssembly's funcref type
-def externref : ValueType<0, 164>; // WebAssembly's externref type
-def x86amx : ValueType<8192, 165>; // X86 AMX value
-
-
-def token : ValueType<0 , 248>; // TokenTy
-def MetadataVT: ValueType<0,249>; // Metadata
+def nxv64i8 : ValueType<512, 121>; // n x 64 x i8 vector value
+
+def nxv1i16 : ValueType<16, 122>; // n x 1 x i16 vector value
+def nxv2i16 : ValueType<32, 123>; // n x 2 x i16 vector value
+def nxv4i16 : ValueType<64, 124>; // n x 4 x i16 vector value
+def nxv8i16 : ValueType<128, 125>; // n x 8 x i16 vector value
+def nxv16i16 : ValueType<256, 126>; // n x 16 x i16 vector value
+def nxv32i16 : ValueType<512, 127>; // n x 32 x i16 vector value
+
+def nxv1i32 : ValueType<32, 128>; // n x 1 x i32 vector value
+def nxv2i32 : ValueType<64, 129>; // n x 2 x i32 vector value
+def nxv4i32 : ValueType<128, 130>; // n x 4 x i32 vector value
+def nxv8i32 : ValueType<256, 131>; // n x 8 x i32 vector value
+def nxv16i32 : ValueType<512, 132>; // n x 16 x i32 vector value
+def nxv32i32 : ValueType<1024, 133>; // n x 32 x i32 vector value
+
+def nxv1i64 : ValueType<64, 134>; // n x 1 x i64 vector value
+def nxv2i64 : ValueType<128, 135>; // n x 2 x i64 vector value
+def nxv4i64 : ValueType<256, 136>; // n x 4 x i64 vector value
+def nxv8i64 : ValueType<512, 137>; // n x 8 x i64 vector value
+def nxv16i64 : ValueType<1024, 138>; // n x 16 x i64 vector value
+def nxv32i64 : ValueType<2048, 139>; // n x 32 x i64 vector value
+
+def nxv1f16 : ValueType<16, 140>; // n x 1 x f16 vector value
+def nxv2f16 : ValueType<32, 141>; // n x 2 x f16 vector value
+def nxv4f16 : ValueType<64, 142>; // n x 4 x f16 vector value
+def nxv8f16 : ValueType<128, 143>; // n x 8 x f16 vector value
+def nxv16f16 : ValueType<256, 144>; // n x 16 x f16 vector value
+def nxv32f16 : ValueType<512, 145>; // n x 32 x f16 vector value
+def nxv1bf16 : ValueType<16, 146>; // n x 1 x bf16 vector value
+def nxv2bf16 : ValueType<32, 147>; // n x 2 x bf16 vector value
+def nxv4bf16 : ValueType<64, 148>; // n x 4 x bf16 vector value
+def nxv8bf16 : ValueType<128, 149>; // n x 8 x bf16 vector value
+def nxv1f32 : ValueType<32, 150>; // n x 1 x f32 vector value
+def nxv2f32 : ValueType<64, 151>; // n x 2 x f32 vector value
+def nxv4f32 : ValueType<128, 152>; // n x 4 x f32 vector value
+def nxv8f32 : ValueType<256, 153>; // n x 8 x f32 vector value
+def nxv16f32 : ValueType<512, 154>; // n x 16 x f32 vector value
+def nxv1f64 : ValueType<64, 155>; // n x 1 x f64 vector value
+def nxv2f64 : ValueType<128, 156>; // n x 2 x f64 vector value
+def nxv4f64 : ValueType<256, 157>; // n x 4 x f64 vector value
+def nxv8f64 : ValueType<512, 158>; // n x 8 x f64 vector value
+
+def x86mmx : ValueType<64, 159>; // X86 MMX value
+def FlagVT : ValueType<0, 160>; // Pre-RA sched glue
+def isVoid : ValueType<0, 161>; // Produces no value
+def untyped : ValueType<8, 162>; // Produces an untyped value
+def funcref : ValueType<0, 163>; // WebAssembly's funcref type
+def externref : ValueType<0, 164>; // WebAssembly's externref type
+def x86amx : ValueType<8192, 165>; // X86 AMX value
+
+
+def token : ValueType<0, 248>; // TokenTy
+def MetadataVT : ValueType<0, 249>; // Metadata
 
 // Pseudo valuetype mapped to the current pointer size to any address space.
 // Should only be used in TableGen.
-def iPTRAny : ValueType<0, 250>;
+def iPTRAny : ValueType<0, 250>;
 
 // Pseudo valuetype to represent "vector of any size"
-def vAny : ValueType<0 , 251>;
+def vAny : ValueType<0, 251>;
 
 // Pseudo valuetype to represent "float of any format"
-def fAny : ValueType<0 , 252>;
+def fAny : ValueType<0, 252>;
 
 // Pseudo valuetype to represent "integer of any bit width"
-def iAny : ValueType<0 , 253>;
+def iAny : ValueType<0, 253>;
 
 // Pseudo valuetype mapped to the current pointer size.
-def iPTR : ValueType<0 , 254>;
+def iPTR : ValueType<0, 254>;
 
 // Pseudo valuetype to represent "any type of any size".
-def Any : ValueType<0 , 255>;
+def Any : ValueType<0, 255>;
 
 /// This class is for targets that want to use pointer types in patterns
 /// with the GlobalISelEmitter. Targets must define their own pointer
diff --git a/llvm/include/llvm/Support/MachineValueType.h b/llvm/include/llvm/Support/MachineValueType.h
index e663953d0577..d01056887271 100644
--- a/llvm/include/llvm/Support/MachineValueType.h
+++ b/llvm/include/llvm/Support/MachineValueType.h
@@ -70,25 +70,25 @@ namespace llvm {
 v512i1 = 24, // 512 x i1
 v1024i1 = 25, // 1024 x i1
 
- v1i8 = 26, // 1 x i8
- v2i8 = 27, // 2 x i8
- v4i8 = 28, // 4 x i8
- v8i8 = 29, // 8 x i8
- v16i8 = 30, // 16 x i8
- v32i8 = 31, // 32 x i8
- v64i8 = 32, // 64 x i8
- v128i8 = 33, //128 x i8
- v256i8 = 34, //256 x i8
-
- v1i16 = 35, // 1 x i16
- v2i16 = 36, // 2 x i16
- v3i16 = 37, // 3 x i16
- v4i16 = 38, // 4 x i16
- v8i16 = 39, // 8 x i16
- v16i16 = 40, // 16 x i16
- v32i16 = 41, // 32 x i16
- v64i16 = 42, // 64 x i16
- v128i16 = 43, //128 x i16
+ v1i8 = 26, // 1 x i8
+ v2i8 = 27, // 2 x i8
+ v4i8 = 28, // 4 x i8
+ v8i8 = 29, // 8 x i8
+ v16i8 = 30, // 16 x i8
+ v32i8 = 31, // 32 x i8
+ v64i8 = 32, // 64 x i8
+ v128i8 = 33, // 128 x i8
+ v256i8 = 34, // 256 x i8
+
+ v1i16 = 35, // 1 x i16
+ v2i16 = 36, // 2 x i16
+ v3i16 = 37, // 3 x i16
+ v4i16 = 38, // 4 x i16
+ v8i16 = 39, // 8 x i16
+ v16i16 = 40, // 16 x i16
+ v32i16 = 41, // 32 x i16
+ v64i16 = 42, // 64 x i16
+ v128i16 = 43, // 128 x i16
 
 v1i32 = 44, // 1 x i32
 v2i32 = 45, // 2 x i32
@@ -105,13 +105,13 @@ namespace llvm {
 v1024i32 = 56, // 1024 x i32
 v2048i32 = 57, // 2048 x i32
 
- v1i64 = 58, // 1 x i64
- v2i64 = 59, // 2 x i64
- v4i64 = 60, // 4 x i64
- v8i64 = 61, // 8 x i64
- v16i64 = 62, // 16 x i64
- v32i64 = 63, // 32 x i64
- v64i64 = 64, // 64 x i64
+ v1i64 = 58, // 1 x i64
+ v2i64 = 59, // 2 x i64
+ v4i64 = 60, // 4 x i64
+ v8i64 = 61, // 8 x i64
+ v16i64 = 62, // 16 x i64
+ v32i64 = 63, // 32 x i64
+ v64i64 = 64, // 64 x i64
 v128i64 = 65, // 128 x i64
 v256i64 = 66, // 256 x i64
 
@@ -173,7 +173,7 @@ namespace llvm {
 nxv8i1 = 111, // n x 8 x i1
 nxv16i1 = 112, // n x 16 x i1
 nxv32i1 = 113, // n x 32 x i1
- nxv64i1 = 114, // n x 64 x i1
+ nxv64i1 = 114, // n x 64 x i1
 
 nxv1i8 = 115, // n x 1 x i8
 nxv2i8 = 116, // n x 2 x i8
@@ -181,7 +181,7 @@ namespace llvm {
 nxv8i8 = 118, // n x 8 x i8
 nxv16i8 = 119, // n x 16 x i8
 nxv32i8 = 120, // n x 32 x i8
- nxv64i8 = 121, // n x 64 x i8
+ nxv64i8 = 121, // n x 64 x i8
 
 nxv1i16 = 122, // n x 1 x i16
 nxv2i16 = 123, // n x 2 x i16
-- 
GitLab
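Aside: a minimal sketch (not part of the patch above) of how the two columns in the realigned tables surface through the MVT API — the first ValueType argument is the size in bits, the second the fixed enum value shared between ValueTypes.td and MachineValueType.h. The assertions mirror `def v4f32 : ValueType<128, 88>`:

    #include "llvm/Support/MachineValueType.h"
    #include <cassert>

    int main() {
      using llvm::MVT;
      // v4f32 is a fixed-width 128-bit vector of four f32 elements.
      assert(MVT(MVT::v4f32).getSizeInBits() == 128);
      assert(MVT(MVT::v4f32).getVectorNumElements() == 4);
      // The 'nxv' entries are scalable vectors: n x 2 x i64.
      assert(MVT(MVT::nxv2i64).isScalableVector());
      return 0;
    }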
From 1f4959b27607d4748c83820ffcf8bf24f09fdd47 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 19 Mar 2021 22:45:50 -0700
Subject: [PATCH 0241/1000] [Driver] Drop unneeded $triple/gcc/$triple detection

---
 clang/lib/Driver/ToolChains/Gnu.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index eb32f4b920b5..906bac57fa77 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2532,12 +2532,6 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(
 TargetTriple.getVendor() == llvm::Triple::Freescale ||
 TargetTriple.getVendor() == llvm::Triple::OpenEmbedded},
 
-      // Natively multiarch systems sometimes put the GCC triple-specific
-      // directory within their multiarch lib directory, resulting in the
-      // triple appearing twice.
-      {CandidateTriple.str() + "/gcc/" + CandidateTriple.str(), "../../..",
-       TargetTriple.getOS() != llvm::Triple::Solaris},
-
 // Deal with cases (on Ubuntu) where the system architecture could be i386
 // but the GCC target architecture could be (say) i686.
 // FIXME: It may be worthwhile to generalize this and look for a second
-- 
GitLab

From a6a15dde5a870f0ce6be0ea26d36cec60e846a2d Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 19 Mar 2021 22:50:35 -0700
Subject: [PATCH 0242/1000] [Driver] Delete toplevel i386-gnu/gcc detection in favor of i386-gnu alias triple detection

This is used by hurd.c (usr/lib/gcc/i386-gnu/4.6.0) but we can leverage
the existing alias triple detection.

---
 clang/lib/Driver/ToolChains/Gnu.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 906bac57fa77..3c1fc87d7896 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2537,9 +2537,6 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(
 // FIXME: It may be worthwhile to generalize this and look for a second
 // triple.
 {"i386-linux-gnu/gcc/" + CandidateTriple.str(), "../../..",
-       (TargetArch == llvm::Triple::x86 &&
-        TargetTriple.getOS() != llvm::Triple::Solaris)},
-      {"i386-gnu/gcc/" + CandidateTriple.str(), "../../..",
 (TargetArch == llvm::Triple::x86 &&
 TargetTriple.getOS() != llvm::Triple::Solaris)}};
-- 
GitLab

From bdf39e6b0ed4b41a1842ac0193f30a726f8d9f63 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 19 Mar 2021 23:23:28 -0700
Subject: [PATCH 0243/1000] [Driver] Drop obsoleted Ubuntu 11.04 gcc detection

It has a very broken gcc installation path
(usr/lib/i386-linux-gnu/gcc/i686-linux-gnu).

---
 clang/lib/Driver/ToolChains/Gnu.cpp | 11 +----------
 clang/test/Driver/gcc-toolchain.cpp | 24 ++++++++++------------
 clang/test/Driver/linux-header-search.cpp | 17 ----------------
 clang/test/Driver/linux-ld.c | 15 --------------
 4 files changed, 11 insertions(+), 56 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 3c1fc87d7896..3491a29a5f9c 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2506,7 +2506,6 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(
 const llvm::Triple &TargetTriple, const ArgList &Args,
 const std::string &LibDir, StringRef CandidateTriple,
 bool NeedsBiarchSuffix, bool GCCDirExists, bool GCCCrossDirExists) {
-  llvm::Triple::ArchType TargetArch = TargetTriple.getArch();
 // Locations relative to the system lib directory where GCC's triple-specific
 // directories might reside.
 struct GCCLibSuffix {
@@ -2530,15 +2529,7 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple(
 // files in that location, not just GCC installation data.
 {CandidateTriple.str(), "..",
 TargetTriple.getVendor() == llvm::Triple::Freescale ||
-           TargetTriple.getVendor() == llvm::Triple::OpenEmbedded},
-
-      // Deal with cases (on Ubuntu) where the system architecture could be i386
-      // but the GCC target architecture could be (say) i686.
-      // FIXME: It may be worthwhile to generalize this and look for a second
-      // triple.
-      {"i386-linux-gnu/gcc/" + CandidateTriple.str(), "../../..",
-       (TargetArch == llvm::Triple::x86 &&
-        TargetTriple.getOS() != llvm::Triple::Solaris)}};
+           TargetTriple.getVendor() == llvm::Triple::OpenEmbedded}};
 
 for (auto &Suffix : Suffixes) {
 if (!Suffix.Active)
diff --git a/clang/test/Driver/gcc-toolchain.cpp b/clang/test/Driver/gcc-toolchain.cpp
index cddf9b1bdbca..0a642c824e6a 100644
--- a/clang/test/Driver/gcc-toolchain.cpp
+++ b/clang/test/Driver/gcc-toolchain.cpp
@@ -1,34 +1,30 @@
 // Test that gcc-toolchain option is working correctly
 //
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \
-// RUN:     --target=i386-unknown-linux -stdlib=libstdc++ \
-// RUN:     --gcc-toolchain=%S/Inputs/ubuntu_11.04_multiarch_tree/usr \
-// RUN:     --sysroot="" \
-// RUN:   | FileCheck %s
+// RUN:   --target=x86_64-linux-gnu --gcc-toolchain=%S/Inputs/ubuntu_14.04_multiarch_tree/usr | \
+// RUN:   FileCheck %s
 //
 // Additionally check that the legacy spelling of the flag works.
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \
-// RUN:     --target=i386-unknown-linux -stdlib=libstdc++ \
-// RUN:     -gcc-toolchain %S/Inputs/ubuntu_11.04_multiarch_tree/usr \
-// RUN:     --sysroot="" \
-// RUN:   | FileCheck %s
+// RUN:   --target=x86_64-linux-gnu -gcc-toolchain %S/Inputs/ubuntu_14.04_multiarch_tree/usr | \
+// RUN:   FileCheck %s
 //
 // Test for header search toolchain detection.
 // CHECK: "-internal-isystem"
-// CHECK: "[[TOOLCHAIN:[^"]+]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5"
+// CHECK: "[[TOOLCHAIN:[^"]+]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8"
 // CHECK: "-internal-isystem"
-// CHECK: "[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/i686-linux-gnu"
+// CHECK: "[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8"
 // CHECK: "-internal-isystem"
-// CHECK: "[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/backward"
+// CHECK: "[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/backward"
 // CHECK: "-internal-isystem" "/usr/local/include"
 //
 // Test for linker toolchain detection. Note that only the '-L' flags will use
 // the same precise formatting of the path as the '-internal-system' flags
 // above, so we just blanket wildcard match the 'crtbegin.o'.
 // CHECK: "{{[^"]*}}ld{{(.exe)?}}"
-// CHECK: "{{[^"]*}}/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5{{/|\\\\}}crtbegin.o"
-// CHECK: "-L[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5"
-// CHECK: "-L[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../.."
+// CHECK-SAME: "{{[^"]*}}/usr/lib/gcc/x86_64-linux-gnu/4.8{{/|\\\\}}crtbegin.o"
+// CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8"
+// CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu"
 
 /// Test we don't detect GCC installation under -B.
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \
diff --git a/clang/test/Driver/linux-header-search.cpp b/clang/test/Driver/linux-header-search.cpp
index 8c1fc99d79f3..4aed02f9c15d 100644
--- a/clang/test/Driver/linux-header-search.cpp
+++ b/clang/test/Driver/linux-header-search.cpp
@@ -67,23 +67,6 @@
 // CHECK-BASIC-LIBSTDCXX-LIBCXXV2-SYSROOT: "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v2"
 // CHECK-BASIC-LIBSTDCXX-LIBCXXV2-SYSROOT: "-internal-isystem" "[[SYSROOT]]/usr/local/include"
 //
-// Test a very broken version of multiarch that shipped in Ubuntu 11.04.
-// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN:     -target i386-unknown-linux -stdlib=libstdc++ \
-// RUN:     --sysroot=%S/Inputs/ubuntu_11.04_multiarch_tree \
-// RUN:     --gcc-toolchain="" \
-// RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-11-04 %s
-// CHECK-UBUNTU-11-04: "{{.*}}clang{{.*}}" "-cc1"
-// CHECK-UBUNTU-11-04: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-UBUNTU-11-04: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/i686-linux-gnu"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/backward"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include"
-// CHECK-UBUNTU-11-04: "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-UBUNTU-11-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-//
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
 // RUN:     -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN:     --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index 1aa955737438..8ba57a941443 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -565,21 +565,6 @@
 // CHECK-BASIC-LIBCXX-C-LINK: "--sysroot=[[SYSROOT]]"
 // CHECK-BASIC-LIBCXX-C-LINK: "-L[[SYSROOT]]/usr/bin/../lib"
 //
-// Test a very broken version of multiarch that shipped in Ubuntu 11.04.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN:     --target=i386-unknown-linux -rtlib=platform \
-// RUN:     --gcc-toolchain="" \
-// RUN:     --sysroot=%S/Inputs/ubuntu_11.04_multiarch_tree \
-// RUN:   | FileCheck --check-prefix=CHECK-UBUNTU-11-04 %s
-// CHECK-UBUNTU-11-04: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-UBUNTU-11-04: "{{.*}}/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5{{/|\\\\}}crtbegin.o"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../i386-linux-gnu"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../.."
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/lib"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib"
-//
 // Check multi arch support on Ubuntu 12.04 LTS.
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN:     --target=arm-unknown-linux-gnueabihf -rtlib=platform \
-- 
GitLab
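Aside: for readers unfamiliar with the flag exercised by the rewritten tests above, a hedged usage sketch (the toolchain path below is hypothetical). --gcc-toolchain= points the Clang driver at a GCC installation root; the driver then derives the C++ include directories and the crtbegin.o/-L library search paths from the GCC version directory found underneath it, which is exactly what the CHECK lines verify:

    # Hypothetical path; mirrors the RUN lines in gcc-toolchain.cpp.
    clang++ --target=x86_64-linux-gnu \
        --gcc-toolchain=/opt/toolchains/usr \
        -### hello.cpp 2>&1 | grep -e -internal-isystem -e crtbegin.o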
From b98ad2ac0845835b1d58faf6881b688e3e186b84 Mon Sep 17 00:00:00 2001
From: joker881
Date: Sat, 20 Mar 2021 12:37:30 +0800
Subject: [PATCH 0244/1000] Remove a duplicated source file entry from clang/unittests/AST/CMakeLists.txt

Reviewed By: MaskRay

Differential Revision: https://reviews.llvm.org/D98922
---
 clang/unittests/AST/CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/unittests/AST/CMakeLists.txt b/clang/unittests/AST/CMakeLists.txt
index 2d5d0172afed..979d59bd0f39 100644
--- a/clang/unittests/AST/CMakeLists.txt
+++ b/clang/unittests/AST/CMakeLists.txt
@@ -13,7 +13,6 @@ add_clang_unittest(ASTTests
 ASTImporterVisibilityTest.cpp
 ASTTraverserTest.cpp
 ASTTypeTraitsTest.cpp
-  ASTTraverserTest.cpp
 ASTVectorTest.cpp
 CommentLexer.cpp
 CommentParser.cpp
-- 
GitLab

From bed9933a461e7b3d0c8c5a8fa770aa1b49802660 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 19 Mar 2021 23:50:22 -0700
Subject: [PATCH 0245/1000] [Driver][test] Fix gcc-toolchain.cpp on non-x86_64

---
 clang/test/Driver/gcc-toolchain.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/test/Driver/gcc-toolchain.cpp b/clang/test/Driver/gcc-toolchain.cpp
index 0a642c824e6a..03a7991d6c70 100644
--- a/clang/test/Driver/gcc-toolchain.cpp
+++ b/clang/test/Driver/gcc-toolchain.cpp
@@ -24,7 +24,8 @@
 // CHECK: "{{[^"]*}}ld{{(.exe)?}}"
 // CHECK-SAME: "{{[^"]*}}/usr/lib/gcc/x86_64-linux-gnu/4.8{{/|\\\\}}crtbegin.o"
 // CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8"
-// CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu"
+/// On x86_64, there is an extra usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu but we should not test it.
+// CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.."
 
 /// Test we don't detect GCC installation under -B.
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \
-- 
GitLab

From 319d093b87a89712573d159da019ce363ae51430 Mon Sep 17 00:00:00 2001
From: Juneyoung Lee
Date: Sun, 21 Mar 2021 02:14:06 +0900
Subject: [PATCH 0246/1000] [CFLGraph] Fix a crash due to missing handling of freeze

https://reviews.llvm.org/D85534#2636321

---
 llvm/lib/Analysis/CFLGraph.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/llvm/lib/Analysis/CFLGraph.h b/llvm/lib/Analysis/CFLGraph.h
index 21842ed36487..02a13d673f40 100644
--- a/llvm/lib/Analysis/CFLGraph.h
+++ b/llvm/lib/Analysis/CFLGraph.h
@@ -284,6 +284,13 @@ template <typename CFLAA> class CFLGraphBuilder {
 addAssignEdge(Src, &Inst);
 }
 
+    void visitFreezeInst(FreezeInst &Inst) {
+      // Accessing freeze(ptr) is equivalent to accessing ptr.
+      // The former raises UB iff the latter raises UB.
+      auto *Src = Inst.getOperand(0);
+      addAssignEdge(Src, &Inst);
+    }
+
 void visitBinaryOperator(BinaryOperator &Inst) {
 auto *Op1 = Inst.getOperand(0);
 auto *Op2 = Inst.getOperand(1);
-- 
GitLab
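Aside: a minimal hand-written IR sketch (not taken from the patch) of the construct the new visitFreezeInst covers. freeze of a pointer is modeled as a plain assignment edge in the CFL graph, so the frozen value aliases its operand instead of hitting the previously unhandled-instruction path:

    ; %fp must alias %p: freeze only fixes poison/undef bits.
    define i32 @use_frozen(i32* %p) {
      %fp = freeze i32* %p
      %v = load i32, i32* %fp
      ret i32 %v
    }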
From 5657f93e788f093c70fb448dd6f9398b149df278 Mon Sep 17 00:00:00 2001
From: Butygin
Date: Fri, 12 Mar 2021 17:39:43 +0300
Subject: [PATCH 0247/1000] [mlir] Canonicalize IfOp with trivial `then` and `else` bodies to list of SelectOp's

* Do we need a threshold on the maximum number of Yield arguments processed
  (i.e. the maximum number of SelectOp's to be generated)?
* Had to modify some old IfOp tests so that they are not optimized away by
  this pattern

Differential Revision: https://reviews.llvm.org/D98592
---
 mlir/lib/Dialect/SCF/SCF.cpp | 40 ++++++++++-
 mlir/test/Dialect/SCF/canonicalize.mlir | 96 +++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/SCF/SCF.cpp b/mlir/lib/Dialect/SCF/SCF.cpp
index fdb9df82900c..78c72953ee6f 100644
--- a/mlir/lib/Dialect/SCF/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/SCF.cpp
@@ -934,11 +934,49 @@ struct RemoveStaticCondition : public OpRewritePattern<IfOp> {
 return success();
 }
 };
+
+struct ConvertTrivialIfToSelect : public OpRewritePattern<IfOp> {
+  using OpRewritePattern<IfOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(IfOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op->getNumResults() == 0)
+      return failure();
+
+    if (!llvm::hasSingleElement(op.thenRegion().front()) ||
+        !llvm::hasSingleElement(op.elseRegion().front()))
+      return failure();
+
+    auto cond = op.condition();
+    auto thenYieldArgs =
+        cast<scf::YieldOp>(op.thenRegion().front().getTerminator())
+            .getOperands();
+    auto elseYieldArgs =
+        cast<scf::YieldOp>(op.elseRegion().front().getTerminator())
+            .getOperands();
+    SmallVector<Value> results(op->getNumResults());
+    assert(thenYieldArgs.size() == results.size());
+    assert(elseYieldArgs.size() == results.size());
+    for (auto it : llvm::enumerate(llvm::zip(thenYieldArgs, elseYieldArgs))) {
+      Value trueVal = std::get<0>(it.value());
+      Value falseVal = std::get<1>(it.value());
+      if (trueVal == falseVal)
+        results[it.index()] = trueVal;
+      else
+        results[it.index()] =
+            rewriter.create<SelectOp>(op.getLoc(), cond, trueVal, falseVal);
+    }
+
+    rewriter.replaceOp(op, results);
+    return success();
+  }
+};
 } // namespace
 
 void IfOp::getCanonicalizationPatterns(OwningRewritePatternList &results,
                                        MLIRContext *context) {
-  results.insert<RemoveUnusedResults, RemoveStaticCondition>(context);
+  results.insert<RemoveUnusedResults, RemoveStaticCondition,
+                 ConvertTrivialIfToSelect>(context);
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index dffe9e252eb1..7c751623db86 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -35,10 +35,12 @@ func @single_iteration(%A: memref) {
 
 // -----
 
+func private @side_effect()
 func @one_unused(%cond: i1) -> (index) {
 %c0 = constant 0 : index
 %c1 = constant 1 : index
 %0, %1 = scf.if %cond -> (index, index) {
+    call @side_effect() : () -> ()
 scf.yield %c0, %c1 : index, index
 } else {
 scf.yield %c0, %c1 : index, index
@@ -49,6 +51,7 @@ func @one_unused(%cond: i1) -> (index) {
 // CHECK-LABEL: func @one_unused
 // CHECK: [[C0:%.*]] = constant 1 : index
 // CHECK: [[V0:%.*]] = scf.if %{{.*}} -> (index) {
+// CHECK: call @side_effect() : () -> ()
 // CHECK: scf.yield [[C0]] : index
 // CHECK: } else
 // CHECK: scf.yield [[C0]] : index
@@ -57,11 +60,13 @@ func @one_unused(%cond: i1) -> (index) {
 
 // -----
 
+func private @side_effect()
 func @nested_unused(%cond1: i1, %cond2: i1) -> (index) {
 %c0 = constant 0 : index
 %c1 = constant 1 : index
 %0, %1 = scf.if %cond1 -> (index, index) {
 %2, %3 = scf.if %cond2 -> (index, index) {
+      call @side_effect() : () -> ()
 scf.yield %c0, %c1 : index, index
 } else {
 scf.yield %c0, %c1 : index, index
@@ -77,6 +82,7 @@ func @nested_unused(%cond1: i1, %cond2: i1) -> (index) {
 // CHECK: [[C0:%.*]] = constant 1 : index
 // CHECK: [[V0:%.*]] = scf.if {{.*}} -> (index) {
 // CHECK: [[V1:%.*]] = scf.if {{.*}} -> (index) {
+// CHECK: call @side_effect() : () -> ()
 // CHECK: scf.yield [[C0]] : index
 // CHECK: } else
 // CHECK: scf.yield [[C0]] : index
@@ -113,6 +119,96 @@ func @all_unused(%cond: i1) {
 
 // -----
 
+func @empty_if1(%cond: i1) {
+  scf.if %cond {
+    scf.yield
+  }
+  return
+}
+
+// CHECK-LABEL: func @empty_if1
+// CHECK-NOT: scf.if
+// CHECK: return
+
+// -----
+
+func @empty_if2(%cond: i1) {
+  scf.if %cond {
+    scf.yield
+  } else {
+    scf.yield
+  }
+  return
+}
+
+// CHECK-LABEL: func @empty_if2
+// CHECK-NOT: scf.if
+// CHECK: return
+
+// -----
+
+func @to_select1(%cond: i1) -> index {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %0 = scf.if %cond -> index {
+    scf.yield %c0 : index
+  } else {
+    scf.yield %c1 : index
+  }
+  return %0 : index
+}
+
+// CHECK-LABEL: func @to_select1
+// CHECK: [[C0:%.*]] = constant 0 : index
+// CHECK: [[C1:%.*]] = constant 1 : index
+// CHECK: [[V0:%.*]] = select {{.*}}, [[C0]], [[C1]]
+// CHECK: return [[V0]] : index
+
+// -----
+
+func @to_select_same_val(%cond: i1) -> (index, index) {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %0, %1 = scf.if %cond -> (index, index) {
+    scf.yield %c0, %c1 : index, index
+  } else {
+    scf.yield %c1, %c1 : index, index
+  }
+  return %0, %1 : index, index
+}
+
+// CHECK-LABEL: func @to_select_same_val
+// CHECK: [[C0:%.*]] = constant 0 : index
+// CHECK: [[C1:%.*]] = constant 1 : index
+// CHECK: [[V0:%.*]] = select {{.*}}, [[C0]], [[C1]]
+// CHECK: return [[V0]], [[C1]] : index, index
+
+// -----
+
+func @to_select2(%cond: i1) -> (index, index) {
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %0, %1 = scf.if %cond -> (index, index) {
+    scf.yield %c0, %c1 : index, index
+  } else {
+    scf.yield %c2, %c3 : index, index
+  }
+  return %0, %1 : index, index
+}
+
+// CHECK-LABEL: func @to_select2
+// CHECK: [[C0:%.*]] = constant 0 : index
+// CHECK: [[C1:%.*]] = constant 1 : index
+// CHECK: [[C2:%.*]] = constant 2 : index
+// CHECK: [[C3:%.*]] = constant 3 : index
+// CHECK: [[V0:%.*]] = select {{.*}}, [[C0]], [[C2]]
+// CHECK: [[V1:%.*]] = select {{.*}}, [[C1]], [[C3]]
+// CHECK: return [[V0]], [[V1]] : index
+
+// -----
+
 func private @make_i32() -> i32
 
 func @for_yields_2(%lb : index, %ub : index, %step : index) -> i32 {
-- 
GitLab
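Aside: a hand-written before/after sketch of the canonicalization added above, mirroring the to_select1 test; an scf.if whose two regions immediately yield becomes one select per differing result:

    func @before(%cond: i1, %a: index, %b: index) -> index {
      %0 = scf.if %cond -> index {
        scf.yield %a : index
      } else {
        scf.yield %b : index
      }
      return %0 : index
    }
    // ... canonicalizes to:
    func @after(%cond: i1, %a: index, %b: index) -> index {
      %0 = select %cond, %a, %b : index
      return %0 : index
    }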
From 2327513b853f030ff399413a651974ab23de4e1b Mon Sep 17 00:00:00 2001
From: "Wang, Pengfei"
Date: Sat, 20 Mar 2021 12:55:46 +0800
Subject: [PATCH 0248/1000] [X86] Fix a bug when calculating the ldtilecfg insertion points.

The BB in which we initialize ldtilecfg is special: we don't need to
check whether its predecessor BBs need to insert ldtilecfg for calls.
We reuse the flag HasCallBeforeAMX so that those predecessors won't be
added to CfgNeedInsert.

This case happens only when the entry BB is in a loop; we need to hoist
the first tile-config point out of the loop in the future.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D98845
---
 llvm/lib/Target/X86/X86PreTileConfig.cpp | 6 ++++++
 llvm/test/CodeGen/X86/AMX/amx-across-func.ll | 7 +++----
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
index dd35a5d1c057..cd5d3d6d90d7 100644
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -296,6 +296,12 @@ static void reloadTileConfig(MachineInstr *MI, int FI,
 MachineBasicBlock *MBB = MI->getParent();
 BBVisitedInfo[MBB] = BBInfo(CfgNeedInsert, MBB, MI);
 
+  // The entry BB is special, since it always has a ldtilecfg before any AMX
+  // instruction. We don't need to check if its predecessor BBs have calls.
+  // FIXME: This case happens only when the entry BB is in a loop. We need to
+  // hoist the first tile config point out of the loop in the future.
+  BBVisitedInfo[MBB].HasCallBeforeAMX = true;
+
 WorkList.push_back(MBB);
 while (!WorkList.empty()) {
 MBB = WorkList.pop_back_val();
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
index 2bb73e26c431..d8d18a74961b 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -280,15 +280,14 @@ define dso_local void @test_loop2(i32 %0) nounwind {
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: callq foo
-; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: testl %ebx, %ebx
 ; CHECK-NEXT: jle .LBB3_3
 ; CHECK-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
 ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT: vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movw $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: tileloadd (%r14,%r15), %tmm0
 ; CHECK-NEXT: movabsq $64, %rax
-- 
GitLab

From 77080a1eb6061df2dcfae8ac84b85ad4d1e02031 Mon Sep 17 00:00:00 2001
From: Jeroen Dobbelaere
Date: Sat, 20 Mar 2021 11:37:09 +0100
Subject: [PATCH 0249/1000] Revert of D49126 [PredicateInfo] Use custom mangling to support ssa_copy with unnamed types.

Now that intrinsic name mangling can cope with unnamed types, the custom
name mangling in PredicateInfo (introduced by D49126) can be removed.
(See D91250, D48541)

Reviewed By: fhahn

Differential Revision: https://reviews.llvm.org/D91661
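Aside: a minimal sketch of the standard overloaded-intrinsic mangling this revert switches back to — one declaration per concrete overload type, with the type name as the suffix, exactly as the updated CHECK lines below expect. (Unnamed struct types, the original motivation for the custom scheme, now receive unique suffixes from the generic mangler per D91250; the exact suffix format for those is not shown here.)

    declare i32 @llvm.ssa.copy.i32(i32)
    declare i1 @llvm.ssa.copy.i1(i1)

    define i32 @example(i32 %x) {
      ; PredicateInfo materializes copies like this at predicate points.
      %x.0 = call i32 @llvm.ssa.copy.i32(i32 %x)
      ret i32 %x.0
    }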
---
 .../llvm/Transforms/Utils/PredicateInfo.h | 6 +-
 llvm/lib/Transforms/Utils/PredicateInfo.cpp | 60 +----
 llvm/test/Other/debugcounter-predicateinfo.ll | 4 +-
 .../Transforms/Util/PredicateInfo/condprop.ll | 42 ++--
 .../Transforms/Util/PredicateInfo/diamond.ll | 8 +-
 .../Transforms/Util/PredicateInfo/edge.ll | 18 +-
 .../Util/PredicateInfo/testandor.ll | 208 +++++++++---------
 .../Util/PredicateInfo/unnamed-types.ll | 4 +-
 8 files changed, 147 insertions(+), 203 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
index c922476ac79d..c4030735d965 100644
--- a/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
+++ b/llvm/include/llvm/Transforms/Utils/PredicateInfo.h
@@ -51,13 +51,11 @@
 #define LLVM_TRANSFORMS_UTILS_PREDICATEINFO_H
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/ilist.h"
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
 
 namespace llvm {
@@ -178,7 +176,7 @@ public:
 class PredicateInfo {
 public:
 PredicateInfo(Function &, DominatorTree &, AssumptionCache &);
-  ~PredicateInfo();
+  ~PredicateInfo() = default;
 
 void verifyPredicateInfo() const;
 
@@ -205,8 +203,6 @@ private:
 // the Predicate Info, they belong to the ValueInfo structs in the ValueInfos
 // vector.
 DenseMap<const Value *, const PredicateBase *> PredicateMap;
-  // The set of ssa_copy declarations we created with our custom mangling.
-  SmallSet<AssertingVH<Function>, 20> CreatedDeclarations;
 };
 
 // This pass does eager building and then printing of PredicateInfo. It is used
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index 4c262f60014c..91280762aaa7 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
@@ -24,7 +23,6 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -539,21 +537,6 @@ void PredicateInfoBuilder::buildPredicateInfo() {
 renameUses(OpsToRename);
 }
 
-// Create a ssa_copy declaration with custom mangling, because
-// Intrinsic::getDeclaration does not handle overloaded unnamed types properly:
-// all unnamed types get mangled to the same string. We use the pointer
-// to the type as name here, as it guarantees unique names for different
-// types and we remove the declarations when destroying PredicateInfo.
-// It is a workaround for PR38117, because solving it in a fully general way is
-// tricky (FIXME).
-static Function *getCopyDeclaration(Module *M, Type *Ty) {
-  std::string Name = "llvm.ssa.copy." + utostr((uintptr_t) Ty);
-  return cast<Function>(
-      M->getOrInsertFunction(Name,
-                             getType(M->getContext(), Intrinsic::ssa_copy, Ty))
-          .getCallee());
-}
-
 // Given the renaming stack, make all the operands currently on the stack real
 // by inserting them into the IR. Return the last operation's value.
 Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
@@ -585,9 +568,8 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
 // order in the case of multiple predicateinfo in the same block.
 if (isa<PredicateWithEdge>(ValInfo)) {
 IRBuilder<> B(getBranchTerminator(ValInfo));
-      Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
-      if (IF->users().empty())
-        PI.CreatedDeclarations.insert(IF);
+      Function *IF = Intrinsic::getDeclaration(
+          F.getParent(), Intrinsic::ssa_copy, Op->getType());
 CallInst *PIC = B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
 PI.PredicateMap.insert({PIC, ValInfo});
@@ -599,9 +581,8 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
 // Insert the predicate directly after the assume. While it also holds
 // directly before it, assume(i1 true) is not a useful fact.
 IRBuilder<> B(PAssume->AssumeInst->getNextNode());
-      Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
-      if (IF->users().empty())
-        PI.CreatedDeclarations.insert(IF);
+      Function *IF = Intrinsic::getDeclaration(
+          F.getParent(), Intrinsic::ssa_copy, Op->getType());
 CallInst *PIC = B.CreateCall(IF, Op);
 PI.PredicateMap.insert({PIC, ValInfo});
 Result.Def = PIC;
@@ -780,23 +761,6 @@ PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
 Builder.buildPredicateInfo();
 }
 
-// Remove all declarations we created . The PredicateInfo consumers are
-// responsible for remove the ssa_copy calls created.
-PredicateInfo::~PredicateInfo() {
-  // Collect function pointers in set first, as SmallSet uses a SmallVector
-  // internally and we have to remove the asserting value handles first.
-  SmallPtrSet<Function *, 20> FunctionPtrs;
-  for (auto &F : CreatedDeclarations)
-    FunctionPtrs.insert(&*F);
-  CreatedDeclarations.clear();
-
-  for (Function *F : FunctionPtrs) {
-    assert(F->user_begin() == F->user_end() &&
-           "PredicateInfo consumer did not remove all SSA copies.");
-    F->eraseFromParent();
-  }
-}
-
 Optional<PredicateConstraint> PredicateBase::getConstraint() const {
 switch (Type) {
 case PT_Assume:
@@ -863,19 +827,6 @@ void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
 AU.addRequired();
 }
 
-// Replace ssa_copy calls created by PredicateInfo with their operand.
-static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
-  for (Instruction &Inst : llvm::make_early_inc_range(instructions(F))) {
-    const auto *PI = PredInfo.getPredicateInfoFor(&Inst);
-    auto *II = dyn_cast<IntrinsicInst>(&Inst);
-    if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
-      continue;
-
-    Inst.replaceAllUsesWith(II->getOperand(0));
-    Inst.eraseFromParent();
-  }
-}
-
 bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
 auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
@@ -883,8 +834,6 @@ bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
 PredInfo->print(dbgs());
 if (VerifyPredicateInfo)
 PredInfo->verifyPredicateInfo();
-
-  replaceCreatedSSACopys(*PredInfo, F);
 return false;
 }
 
@@ -896,7 +845,6 @@ PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
 auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
 PredInfo->print(OS);
 
-  replaceCreatedSSACopys(*PredInfo, F);
 return PreservedAnalyses::all();
 }
 
diff --git a/llvm/test/Other/debugcounter-predicateinfo.ll b/llvm/test/Other/debugcounter-predicateinfo.ll
index bbc7a0f71271..90303ab0dabc 100644
--- a/llvm/test/Other/debugcounter-predicateinfo.ll
+++ b/llvm/test/Other/debugcounter-predicateinfo.ll
@@ -8,10 +8,10 @@ define fastcc void @barney() {
 ; CHECK-NEXT: br label [[BB22:%.*]]
 ; CHECK: bb22:
 ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 undef, 2
-; CHECK: [[TMP23_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[TMP23]])
+; CHECK: [[TMP23_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP23]])
 ; CHECK-NEXT: br i1 [[TMP23]], label [[BB29:%.*]], label [[BB35:%.*]]
 ; CHECK: bb29:
-; CHECK: [[TMP23_0_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[TMP23_0]])
+; CHECK: [[TMP23_0_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP23_0]])
 ; CHECK-NEXT: br i1 [[TMP23]], label [[BB33:%.*]], label [[BB35]]
 ; CHECK: bb33:
 ; CHECK-NEXT: br i1 [[TMP23_0_1]], label [[BB35]], label [[BB35]]
diff --git a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
index 689326b6ca97..9400e60c81ff 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/condprop.ll
@@ -186,10 +186,10 @@ case3:
 define i1 @test5(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
-; CHECK: [[Y_1:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[Y_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
 ; CHECK: same:
 ; CHECK-NEXT: [[CMP2:%.*]] = icmp ne i32 [[X_0]], [[Y_0]]
@@ -259,10 +259,10 @@ different:
 define i1 @test7(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
-; CHECK: [[Y_1:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[Y_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
 ; CHECK: same:
 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[X_0]], [[Y_0]]
@@ -286,10 +286,10 @@ different:
 define i1 @test7_fp(float %x, float %y) {
 ; CHECK-LABEL: @test7_fp(
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
-; CHECK: [[X_0:%.*]] = call float @llvm.ssa.copy.{{.+}}(float [[X]])
-; CHECK: [[X_1:%.*]] = call float @llvm.ssa.copy.{{.+}}(float [[X]])
-; CHECK: [[Y_0:%.*]] = call float @llvm.ssa.copy.{{.+}}(float [[Y]])
-; CHECK: [[Y_1:%.*]] = call float @llvm.ssa.copy.{{.+}}(float [[Y]])
+; CHECK: [[X_0:%.*]] = call float @llvm.ssa.copy.f32(float [[X]])
+; CHECK: [[X_1:%.*]] = call float @llvm.ssa.copy.f32(float [[X]])
+; CHECK: [[Y_0:%.*]] = call float @llvm.ssa.copy.f32(float [[Y]])
+; CHECK: [[Y_1:%.*]] = call float @llvm.ssa.copy.f32(float [[Y]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
 ; CHECK: same:
 ; CHECK-NEXT: [[CMP2:%.*]] = fcmp ule float [[X_0]], [[Y_0]]
@@ -359,8 +359,8 @@ different:
 define i32 @test9(i32 %i, i32 %j) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
-; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[I]])
-; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[J]])
+; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]])
+; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
 ; CHECK: cond_true:
 ; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
@@ -382,8 +382,8 @@ ret:
 define i32 @test10(i32 %j, i32 %i) {
 ; CHECK-LABEL: @test10(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
-; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[I]])
-; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[J]])
+; CHECK: [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]])
+; CHECK: [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
 ; CHECK: cond_true:
 ; CHECK-NEXT: [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
@@ -409,14 +409,14 @@ define i32 @test11(i32 %x) {
 ; CHECK-NEXT: [[V0:%.*]] = call i32 @yogibar()
 ; CHECK-NEXT: [[V1:%.*]] = call i32 @yogibar()
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]]
-; CHECK: [[V0_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[V0]])
-; CHECK: [[V1_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[V1]])
+; CHECK: [[V0_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V0]])
+; CHECK: [[V1_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V1]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]]
 ; CHECK: cond_true:
 ; CHECK-NEXT: ret i32 [[V1_0]]
 ; CHECK: next:
 ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0_0]]
-; CHECK: [[V0_0_1:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[V0_0]])
+; CHECK: [[V0_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V0_0]])
 ; CHECK-NEXT: br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]]
 ; CHECK: cond_true2:
 ; CHECK-NEXT: ret i32 [[V0_0_1]]
@@ -445,8 +445,8 @@ next2:
 define i32 @test12(i32 %x) {
 ; CHECK-LABEL: @test12(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
 ; CHECK: cond_true:
 ; CHECK-NEXT: br label [[RET:%.*]]
diff --git a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
index 8e3da687c139..e3f56d88caf0 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/diamond.ll
@@ -5,12 +5,12 @@ define i1 @f(i32 %x, i1 %y) {
 ; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]]
 ; CHECK: bb0:
 ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB3:%.*]]
 ; CHECK: bb1:
 ; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1
 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2
-; CHECK: [[X2_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X2]])
+; CHECK: [[X2_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X2]])
 ; CHECK-NEXT: br i1 [[CMP2]], label [[BB2]], label [[BB3]]
 ; CHECK: bb2:
 ; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ]
@@ -38,12 +38,12 @@ define i1 @g(i32 %x, i1 %y) {
 ; CHECK-NEXT: br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]]
 ; CHECK: bb0:
 ; CHECK-NEXT: [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[BB3:%.*]], label [[BB2:%.*]]
 ; CHECK: bb1:
 ; CHECK-NEXT: [[X2:%.*]] = add nuw nsw i32 [[X]], 1
 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sge i32 [[X2]], 2
-; CHECK: [[X2_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X2]])
+; CHECK: [[X2_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X2]])
 ; CHECK-NEXT: br i1 [[CMP2]], label [[BB3]], label [[BB2]]
 ; CHECK: bb2:
 ; CHECK-NEXT: [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ]
diff --git a/llvm/test/Transforms/Util/PredicateInfo/edge.ll b/llvm/test/Transforms/Util/PredicateInfo/edge.ll
index dbd15dc70ba5..2b88e32fd450 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/edge.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/edge.ll
@@ -5,7 +5,7 @@ define i32 @f1(i32 %x) {
 ; CHECK-LABEL: @f1(
 ; CHECK-NEXT: bb0:
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]]
 ; CHECK: bb1:
 ; CHECK-NEXT: br label [[BB2]]
@@ -29,7 +29,7 @@ define i32 @f2(i32 %x) {
 ; CHECK-LABEL: @f2(
 ; CHECK-NEXT: bb0:
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK: bb1:
 ; CHECK-NEXT: br label [[BB2]]
@@ -52,7 +52,7 @@ bb2:
 define i32 @f3(i32 %x) {
 ; CHECK-LABEL: @f3(
 ; CHECK-NEXT: bb0:
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X:%.*]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X:%.*]])
 ; CHECK-NEXT: switch i32 [[X]], label [[BB1:%.*]] [
 ; CHECK-NEXT: i32 0, label [[BB2:%.*]]
 ; CHECK-NEXT: ]
@@ -78,7 +78,7 @@ define double @fcmp_oeq_not_zero(double %x, double %y) {
 ; CHECK-LABEL: @fcmp_oeq_not_zero(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 2.000000e+00
-; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.{{.+}}(double [[Y]])
+; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
 ; CHECK: if:
 ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
@@ -105,7 +105,7 @@ define double @fcmp_une_not_zero(double %x, double %y) {
 ; CHECK-LABEL: @fcmp_une_not_zero(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], 2.000000e+00
-; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.{{.+}}(double [[Y]])
+; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
 ; CHECK: else:
 ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
@@ -132,7 +132,7 @@ define double @fcmp_oeq_zero(double %x, double %y) {
 ; CHECK-LABEL: @fcmp_oeq_zero(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 0.000000e+00
-; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.{{.+}}(double [[Y]])
+; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
 ; CHECK: if:
 ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
@@ -159,7 +159,7 @@ define double @fcmp_une_zero(double %x, double %y) {
 ; CHECK-LABEL: @fcmp_une_zero(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], -0.000000e+00
-; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.{{.+}}(double [[Y]])
+; CHECK: [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
 ; CHECK: else:
 ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
@@ -188,7 +188,7 @@ define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2)
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]]
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], [[Z]]
-; CHECK: [[Z_0:%.*]] = call double @llvm.ssa.copy.{{.+}}(double [[Z]])
+; CHECK: [[Z_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Z]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
 ; CHECK: if:
 ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]]
@@ -217,7 +217,7 @@ define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2)
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]]
 ; CHECK-NEXT: [[CMP:%.*]] = fcmp une double [[Y:%.*]], [[Z]]
-; CHECK: [[Z_0:%.*]] = call double @llvm.ssa.copy.{{.+}}(double [[Z]])
+; CHECK: [[Z_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Z]])
 ; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
 ; CHECK: else:
 ; CHECK-NEXT: [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]]
diff --git a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
index 64ca2664e4f8..9c765fe72b89 100644
--- a/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
+++ b/llvm/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -10,11 +10,11 @@ define void @test_or(i32 %x, i32 %y) {
 ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
-; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[Z]])
-; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[XZ]])
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[YZ]])
-; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
 ; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]]
 ; CHECK: oneof:
 ; CHECK-NEXT: call void @foo(i1 [[XZ]])
@@ -55,11 +55,11 @@ define void @test_or_logical(i32 %x, i32 %y) {
 ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT: [[Z:%.*]] = select i1 [[XZ]], i1 true, i1 [[YZ]]
-; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[Z]])
-; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[XZ]])
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[YZ]])
-; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
 ; CHECK-NEXT: br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]]
 ; CHECK: oneof:
 ; CHECK-NEXT: call void @foo(i1 [[XZ]])
@@ -100,11 +100,11 @@ define void @test_and(i32 %x, i32 %y) {
 ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
-; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[Z]])
-; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[XZ]])
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[YZ]])
-; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
 ; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
 ; CHECK: both:
 ; CHECK-NEXT: call void @foo(i1 [[XZ_0]])
@@ -145,11 +145,11 @@ define void @test_and_logical(i32 %x, i32 %y) {
 ; CHECK-NEXT: [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT: [[Z:%.*]] = select i1 [[XZ]], i1 [[YZ]], i1 false
-; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[Z]])
-; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[XZ]])
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[YZ]])
-; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK: [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
 ; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
 ; CHECK: both:
 ; CHECK-NEXT: call void @foo(i1 [[XZ_0]])
@@ -190,11 +190,11 @@ define void @testandsame(i32 %x, i32 %y) {
 ; CHECK-NEXT: [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0
 ; CHECK-NEXT: [[XLT:%.*]] = icmp slt i32 [[X]], 100
 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XGT]], [[XLT]]
-; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[Z]])
-; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[XGT]])
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[X_0_1:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X_0]])
-; CHECK: [[XLT_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[XLT]])
+; CHECK: [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK: [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[X_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X_0]])
+; CHECK: [[XLT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XLT]])
 ; CHECK-NEXT: br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
 ; CHECK: both:
 ; CHECK-NEXT: call void @foo(i1 [[XGT_0]])
@@ -229,16 +229,16 @@ define void @testandassume(i32 %x, i32 %y) {
 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT: [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
 ; CHECK-NEXT: call void @llvm.assume(i1 [[Z]])
-; CHECK: [[TMP1:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[Y]])
-; CHECK: [[TMP2:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[YZ]])
-; CHECK: [[TMP3:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
-; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[XZ]])
-; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[Z]])
-; CHECK: [[DOT0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[TMP5]])
-; CHECK: [[DOT01:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[TMP4]])
-; CHECK: [[DOT02:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[TMP3]])
-; CHECK: [[DOT03:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[TMP2]])
-; CHECK: [[DOT04:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[TMP1]])
+; CHECK: [[TMP1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK: [[TMP2:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK: [[TMP3:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK: [[DOT0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP5]])
+; CHECK: [[DOT01:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP4]])
+; CHECK: [[DOT02:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP3]])
+; CHECK: [[DOT03:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP2]])
+; CHECK: [[DOT04:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP1]])
 ; CHECK-NEXT: br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]]
 ; CHECK: both:
 ; CHECK-NEXT: call void @foo(i1 [[DOT01]])
@@ -274,8 +274,8 @@ define void @testorassume(i32 %x, i32 %y) {
 ; CHECK-NEXT: [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
 ; CHECK-NEXT: [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
 ; CHECK-NEXT: call void @llvm.assume(i1 [[Z]])
-; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[Z]])
-; CHECK: [[DOT0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[TMP1]])
+; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK: [[DOT0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP1]])
 ; CHECK-NEXT: br i1 [[TMP1]], label [[BOTH:%.*]], label [[NOPE:%.*]]
 ; CHECK: both:
 ; CHECK-NEXT: call void @foo(i1 [[XZ]])
@@ -307,11 +307,11 @@ define void @test_and_one_unknown_cond(i32 %x, i1 %c1) {
 ; CHECK-LABEL: @test_and_one_unknown_cond(
 ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT: [[A:%.*]] = and i1 [[C1:%.*]], [[C2]]
-; CHECK: [[A_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A]])
-; CHECK: [[A_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A]])
-; CHECK: [[C1_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[C1]])
-; CHECK: [[C2_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[C2]])
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
+; CHECK: [[A_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A]])
+; CHECK: [[A_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A]])
+; CHECK: [[C1_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[C1]])
+; CHECK: [[C2_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[C2]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
 ; CHECK-NEXT: br i1 [[A]], label [[BOTH:%.*]], label [[NOPE:%.*]]
 ; CHECK: both:
 ; CHECK-NEXT: call void @bar(i32 [[X_0]])
@@ -349,11 +349,11 @@ define void @test_or_one_unknown_cond(i32 %x, i1 %c1) {
 ; CHECK-LABEL: @test_or_one_unknown_cond(
 ; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[X:%.*]], 0
 ; CHECK-NEXT: [[A:%.*]] = or i1 [[C1:%.*]], [[C2]]
-; CHECK: [[A_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A]])
-; CHECK: [[A_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A]])
-; CHECK: [[C1_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[C1]])
-; CHECK: [[C2_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[C2]])
-; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.{{.+}}(i32 [[X]])
+; CHECK: [[A_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A]])
+; CHECK: [[A_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A]])
+; CHECK: [[C1_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[C1]])
+; CHECK: [[C2_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[C2]])
+; CHECK: [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
 ; CHECK-NEXT: br i1 [[A]], label [[NOPE:%.*]], label [[BOTH_INVERTED:%.*]]
 ; CHECK: both_inverted:
 ; CHECK-NEXT: call void @bar(i32 [[X_0]])
@@ -391,12 +391,12 @@ define void @test_and_chain(i1 %a, i1 %b, i1 %c) {
 ; CHECK-LABEL: @test_and_chain(
 ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]]
-; CHECK: [[AND2_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[AND2]])
-; CHECK: [[AND2_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[AND2]])
-; CHECK: [[AND1_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[AND1]])
-; CHECK: [[A_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A]])
-; CHECK: [[B_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[B]])
-; CHECK: [[C_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[C]])
+; CHECK: [[AND2_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[AND2]])
+; CHECK: [[AND2_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[AND2]])
+; CHECK: [[AND1_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[AND1]])
+; CHECK: [[A_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A]])
+; CHECK: [[B_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[B]])
+; CHECK: [[C_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[C]])
 ; CHECK-NEXT: br i1 [[AND2]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK: if:
 ; CHECK-NEXT: call void @foo(i1 [[A_0]])
@@ -438,12 +438,12 @@ define void @test_or_chain(i1 %a, i1 %b, i1 %c) {
 ; CHECK-LABEL: @test_or_chain(
 ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]]
-; CHECK: [[OR2_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[OR2]])
-; CHECK: [[OR2_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[OR2]])
-; CHECK: [[OR1_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[OR1]])
-; CHECK: [[A_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A]])
-; CHECK: [[B_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[B]])
-; CHECK: [[C_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[C]])
+; CHECK: [[OR2_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[OR2]])
+; CHECK: [[OR2_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[OR2]])
+; CHECK: [[OR1_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[OR1]])
+; CHECK: [[A_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A]])
+; CHECK: [[B_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[B]])
+; CHECK: [[C_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[C]])
 ; CHECK-NEXT: br i1 [[OR2]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK: if:
 ; CHECK-NEXT: call void @foo(i1 [[A]])
@@ -485,10 +485,10 @@ define void @test_and_or_mixed(i1 %a, i1 %b, i1 %c) {
 ; CHECK-LABEL: @test_and_or_mixed(
 ; CHECK-NEXT: [[OR:%.*]] = or i1 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT: [[AND:%.*]] = and i1 [[OR]], [[C:%.*]]
-; CHECK: [[AND_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[AND]])
-; CHECK: [[AND_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[AND]])
-; CHECK: [[OR_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[OR]])
-; CHECK: [[C_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[C]])
+; CHECK: [[AND_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[AND]])
+; CHECK: [[AND_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[AND]])
+; CHECK: [[OR_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[OR]])
+; CHECK: [[C_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[C]])
 ; CHECK-NEXT: br i1 [[AND]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK: if:
 ; CHECK-NEXT: call void @foo(i1 [[A]])
@@ -542,15 +542,15 @@ define void @test_deep_and_chain(i1 %a1) {
 ; CHECK-NEXT: [[A13:%.*]] = and i1 [[A12]], true
 ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], true
 ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], true
-; CHECK: [[A15_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A15]])
-; CHECK: [[A15_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A15]])
-; CHECK: [[A14_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A14]])
-; CHECK: [[A13_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A13]])
-; CHECK: [[A12_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A12]])
-; CHECK: [[A11_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A11]])
-; CHECK: [[A10_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A10]])
-; CHECK: [[A9_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A9]])
-; CHECK: [[A8_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A8]])
+; CHECK: [[A15_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A15]])
+; CHECK: [[A15_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A15]])
+; CHECK: [[A14_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A14]])
+; CHECK: [[A13_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A13]])
+; CHECK: [[A12_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A12]])
+; CHECK: [[A11_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A11]])
+; CHECK: [[A10_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A10]])
+; CHECK: [[A9_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A9]])
+; CHECK: [[A8_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A8]])
 ; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK: if:
 ; CHECK-NEXT: call void @foo(i1 [[A1]])
@@ -656,15 +656,15 @@ define void @test_deep_and_tree(i1 %a1) {
 ; CHECK-NEXT: [[A13:%.*]] = and i1 [[A12]], [[A12]]
 ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], [[A13]]
 ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], [[A14]]
-; CHECK: [[A15_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A15]])
-; CHECK: [[A15_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A15]])
-; CHECK: [[A14_0:%.*]] =
call i1 @llvm.ssa.copy.{{.+}}(i1 [[A14]]) -; CHECK: [[A13_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A13]]) -; CHECK: [[A12_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A12]]) -; CHECK: [[A11_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A11]]) -; CHECK: [[A10_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A10]]) -; CHECK: [[A9_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A9]]) -; CHECK: [[A8_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A8]]) +; CHECK: [[A15_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A15]]) +; CHECK: [[A15_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A15]]) +; CHECK: [[A14_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A14]]) +; CHECK: [[A13_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A13]]) +; CHECK: [[A12_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A12]]) +; CHECK: [[A11_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A11]]) +; CHECK: [[A10_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A10]]) +; CHECK: [[A9_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A9]]) +; CHECK: [[A8_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A8]]) ; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) @@ -770,15 +770,15 @@ define void @test_deep_or_tree(i1 %a1) { ; CHECK-NEXT: [[A13:%.*]] = or i1 [[A12]], [[A12]] ; CHECK-NEXT: [[A14:%.*]] = or i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = or i1 [[A14]], [[A14]] -; CHECK: [[A15_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A15]]) -; CHECK: [[A15_1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A15]]) -; CHECK: [[A14_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A14]]) -; CHECK: [[A13_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A13]]) -; CHECK: [[A12_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A12]]) -; CHECK: [[A11_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A11]]) -; CHECK: [[A10_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A10]]) -; CHECK: [[A9_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A9]]) -; CHECK: [[A8_0:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A8]]) +; CHECK: [[A15_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A15]]) +; CHECK: [[A15_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A15]]) +; CHECK: [[A14_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A14]]) +; CHECK: [[A13_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A13]]) +; CHECK: [[A12_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A12]]) +; CHECK: [[A11_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A11]]) +; CHECK: [[A10_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A10]]) +; CHECK: [[A9_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A9]]) +; CHECK: [[A8_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A8]]) ; CHECK-NEXT: br i1 [[A15]], label [[IF:%.*]], label [[ELSE:%.*]] ; CHECK: if: ; CHECK-NEXT: call void @foo(i1 [[A1]]) @@ -873,11 +873,11 @@ define void @test_assume_and_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-NEXT: [[AND1:%.*]] = and i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[AND2:%.*]] = and i1 [[AND1]], [[C:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[AND2]]) -; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[C]]) -; CHECK: [[TMP2:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[B]]) -; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A]]) -; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[AND1]]) -; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[AND2]]) +; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[C]]) +; CHECK: [[TMP2:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[B]]) +; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A]]) +; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[AND1]]) +; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[AND2]]) ; CHECK-NEXT: 
call void @foo(i1 [[TMP3]]) ; CHECK-NEXT: call void @foo(i1 [[TMP2]]) ; CHECK-NEXT: call void @foo(i1 [[TMP1]]) @@ -901,7 +901,7 @@ define void @test_assume_or_chain(i1 %a, i1 %b, i1 %c) { ; CHECK-NEXT: [[OR1:%.*]] = or i1 [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: [[OR2:%.*]] = or i1 [[OR1]], [[C:%.*]] ; CHECK-NEXT: call void @llvm.assume(i1 [[OR2]]) -; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[OR2]]) +; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[OR2]]) ; CHECK-NEXT: call void @foo(i1 [[A]]) ; CHECK-NEXT: call void @foo(i1 [[B]]) ; CHECK-NEXT: call void @foo(i1 [[C]]) @@ -937,14 +937,14 @@ define void @test_assume_deep_and_tree(i1 %a1) { ; CHECK-NEXT: [[A14:%.*]] = and i1 [[A13]], [[A13]] ; CHECK-NEXT: [[A15:%.*]] = and i1 [[A14]], [[A14]] ; CHECK-NEXT: call void @llvm.assume(i1 [[A15]]) -; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A8]]) -; CHECK: [[TMP2:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A9]]) -; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A10]]) -; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A11]]) -; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A12]]) -; CHECK: [[TMP6:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A13]]) -; CHECK: [[TMP7:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A14]]) -; CHECK: [[TMP8:%.*]] = call i1 @llvm.ssa.copy.{{.+}}(i1 [[A15]]) +; CHECK: [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A8]]) +; CHECK: [[TMP2:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A9]]) +; CHECK: [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A10]]) +; CHECK: [[TMP4:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A11]]) +; CHECK: [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A12]]) +; CHECK: [[TMP6:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A13]]) +; CHECK: [[TMP7:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A14]]) +; CHECK: [[TMP8:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[A15]]) ; CHECK-NEXT: call void @foo(i1 [[A1]]) ; CHECK-NEXT: call void @foo(i1 [[A2]]) ; CHECK-NEXT: call void @foo(i1 [[A3]]) diff --git a/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll b/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll index d1e0f358fc9f..13575e7caa66 100644 --- a/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll +++ b/llvm/test/Transforms/Util/PredicateInfo/unnamed-types.ll @@ -8,12 +8,12 @@ ; CHECK-LABEL: bb: ; CHECK: Has predicate info ; CHECK: branch predicate info { TrueEdge: 1 Comparison: %cmp1 = icmp ne %0* %arg, null Edge: [label %bb,label %bb1], RenamedOp: %arg } -; CHECK-NEXT: %arg.0 = call %0* @llvm.ssa.copy.{{.+}}(%0* %arg) +; CHECK-NEXT: %arg.0 = call %0* @llvm.ssa.copy.p0s_s.{{.+}}(%0* %arg) ; CHECK-LABEL: bb1: ; CHECK: Has predicate info ; CHECK-NEXT: branch predicate info { TrueEdge: 0 Comparison: %cmp2 = icmp ne %1* null, %tmp Edge: [label %bb1,label %bb3], RenamedOp: %tmp } -; CHECK-NEXT: %tmp.0 = call %1* @llvm.ssa.copy.{{.+}}(%1* %tmp) +; CHECK-NEXT: %tmp.0 = call %1* @llvm.ssa.copy.p0s_s.{{.+}}(%1* %tmp) define void @f0(%0* %arg, %1* %tmp) { bb: -- GitLab From 7219b31d40f14604c669d633b014d0cc8b707cf3 Mon Sep 17 00:00:00 2001 From: Butygin Date: Fri, 12 Mar 2021 17:39:43 +0300 Subject: [PATCH 0250/1000] [mlir] Additional folding for SelectOp * Fold SelectOp when both true and false args are same SSA value * Fold some cmp + select patterns Differential Revision: https://reviews.llvm.org/D98576 --- mlir/lib/Dialect/StandardOps/IR/Ops.cpp | 27 ++++++++++++++++-- mlir/test/Dialect/Standard/canonicalize.mlir | 29 ++++++++++++++++++++ 2 files changed, 54 insertions(+), 2 deletions(-) diff --git 
a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
index bd38e154bcf6..4830a51827a5 100644
--- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
+++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
@@ -1360,15 +1360,38 @@ static LogicalResult verify(ReturnOp op) {
 //===----------------------------------------------------------------------===//
 
 OpFoldResult SelectOp::fold(ArrayRef<Attribute> operands) {
+  auto trueVal = getTrueValue();
+  auto falseVal = getFalseValue();
+  if (trueVal == falseVal)
+    return trueVal;
+
   auto condition = getCondition();
 
   // select true, %0, %1 => %0
   if (matchPattern(condition, m_One()))
-    return getTrueValue();
+    return trueVal;
 
   // select false, %0, %1 => %1
   if (matchPattern(condition, m_Zero()))
-    return getFalseValue();
+    return falseVal;
+
+  if (auto cmp = dyn_cast_or_null<CmpIOp>(condition.getDefiningOp())) {
+    auto pred = cmp.predicate();
+    if (pred == mlir::CmpIPredicate::eq || pred == mlir::CmpIPredicate::ne) {
+      auto cmpLhs = cmp.lhs();
+      auto cmpRhs = cmp.rhs();
+
+      // %0 = cmpi eq, %arg0, %arg1
+      // %1 = select %0, %arg0, %arg1 => %arg1
+
+      // %0 = cmpi ne, %arg0, %arg1
+      // %1 = select %0, %arg0, %arg1 => %arg0
+
+      if ((cmpLhs == trueVal && cmpRhs == falseVal) ||
+          (cmpRhs == trueVal && cmpLhs == falseVal))
+        return pred == mlir::CmpIPredicate::ne ? trueVal : falseVal;
+    }
+  }
+
   return nullptr;
 }
 
diff --git a/mlir/test/Dialect/Standard/canonicalize.mlir b/mlir/test/Dialect/Standard/canonicalize.mlir
index a6bf0c78321a..77022024ae48 100644
--- a/mlir/test/Dialect/Standard/canonicalize.mlir
+++ b/mlir/test/Dialect/Standard/canonicalize.mlir
@@ -339,3 +339,32 @@ func @subtensor_insert_output_dest_canonicalize(%arg0 : tensor<2x3xi32>, %arg1 :
 // CHECK: %[[GENERATE:.+]] = tensor.generate
 // CHECK: %[[RESULT:.+]] = subtensor_insert %[[ARG0]] into %[[GENERATE]]
 // CHECK: return %[[RESULT]]
+
+// -----
+
+// CHECK-LABEL: @select_same_val
+// CHECK: return %arg1
+func @select_same_val(%arg0: i1, %arg1: i64) -> i64 {
+  %0 = select %arg0, %arg1, %arg1 : i64
+  return %0 : i64
+}
+
+// -----
+
+// CHECK-LABEL: @select_cmp_eq_select
+// CHECK: return %arg1
+func @select_cmp_eq_select(%arg0: i64, %arg1: i64) -> i64 {
+  %0 = cmpi eq, %arg0, %arg1 : i64
+  %1 = select %0, %arg0, %arg1 : i64
+  return %1 : i64
+}
+
+// -----
+
+// CHECK-LABEL: @select_cmp_ne_select
+// CHECK: return %arg0
+func @select_cmp_ne_select(%arg0: i64, %arg1: i64) -> i64 {
+  %0 = cmpi ne, %arg0, %arg1 : i64
+  %1 = select %0, %arg0, %arg1 : i64
+  return %1 : i64
+}
-- 
GitLab

From 4dd92d61dbc4b3c51a98e1d0bfccabed24759ba9 Mon Sep 17 00:00:00 2001
From: Nathan James
Date: Sat, 20 Mar 2021 10:59:36 +0000
Subject: [PATCH 0251/1000] [clang-tidy] Fix bugprone-terminating-continue when
 continue appears inside a switch

Don't emit a warning if the `continue` appears in a switch context as
changing it to `break` will break out of the switch rather than a do
loop containing the switch.

Fixes https://llvm.org/PR49492.
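For illustration, a minimal sketch of the pattern the check must now leave
alone (the function is hypothetical, reduced from the test case added below):

  void f(int n) {
    do {
      switch (n) {
      case 1:
        continue; // exits the do-while(false) loop via its condition
      default:
        break;    // rewriting the continue to break would only leave the switch
      }
    } while (false);
  }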
Reviewed By: aaron.ballman Differential Revision: https://reviews.llvm.org/D98338 --- .../bugprone/TerminatingContinueCheck.cpp | 9 +++++---- .../checkers/bugprone-terminating-continue.cpp | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.cpp index 43402a221218..65da4c325de4 100644 --- a/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/TerminatingContinueCheck.cpp @@ -26,10 +26,11 @@ void TerminatingContinueCheck::registerMatchers(MatchFinder *Finder) { equalsBoundNode("closestLoop")); Finder->addMatcher( - continueStmt(hasAncestor(stmt(anyOf(forStmt(), whileStmt(), - cxxForRangeStmt(), doStmt())) - .bind("closestLoop")), - hasAncestor(DoWithFalse)) + continueStmt( + hasAncestor(stmt(anyOf(forStmt(), whileStmt(), cxxForRangeStmt(), + doStmt(), switchStmt())) + .bind("closestLoop")), + hasAncestor(DoWithFalse)) .bind("continue"), this); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-terminating-continue.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-terminating-continue.cpp index 4bdcbc42fc47..04fc4a80ea7d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone-terminating-continue.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone-terminating-continue.cpp @@ -32,6 +32,15 @@ void f() { // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: 'continue' in loop with false condition is equivalent to 'break' [bugprone-terminating-continue] // CHECK-FIXES: if (x > 0) break; } while (false); + + switch (2) { + case 2: + do { + continue; // LoopInSwitch + // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: 'continue' in loop with false condition is equivalent to 'break' [bugprone-terminating-continue] + // CHECK-FIXES: break; // LoopInSwitch + } while (0); + } } void g() { @@ -62,4 +71,12 @@ void g() { if (n>2) continue; } } while (false); + + do { + switch (2) { + case 1: + case 2: + continue; + } + } while (false); } -- GitLab From 243333ef3ec6c1e3910eb442177c2e2e927e6a87 Mon Sep 17 00:00:00 2001 From: David Zarzycki Date: Sat, 20 Mar 2021 07:29:01 -0400 Subject: [PATCH 0252/1000] Revert "[Driver] Drop obsoleted Ubuntu 11.04 gcc detection" This reverts commit bdf39e6b0ed4b41a1842ac0193f30a726f8d9f63. The change is failing on Fedora 33 (x86-64). --- clang/lib/Driver/ToolChains/Gnu.cpp | 11 +++++++++- clang/test/Driver/gcc-toolchain.cpp | 25 +++++++++++++---------- clang/test/Driver/linux-header-search.cpp | 17 +++++++++++++++ clang/test/Driver/linux-ld.c | 15 ++++++++++++++ 4 files changed, 56 insertions(+), 12 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 3491a29a5f9c..3c1fc87d7896 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -2506,6 +2506,7 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple( const llvm::Triple &TargetTriple, const ArgList &Args, const std::string &LibDir, StringRef CandidateTriple, bool NeedsBiarchSuffix, bool GCCDirExists, bool GCCCrossDirExists) { + llvm::Triple::ArchType TargetArch = TargetTriple.getArch(); // Locations relative to the system lib directory where GCC's triple-specific // directories might reside. struct GCCLibSuffix { @@ -2529,7 +2530,15 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple( // files in that location, not just GCC installation data. 
{CandidateTriple.str(), "..", TargetTriple.getVendor() == llvm::Triple::Freescale || - TargetTriple.getVendor() == llvm::Triple::OpenEmbedded}}; + TargetTriple.getVendor() == llvm::Triple::OpenEmbedded}, + + // Deal with cases (on Ubuntu) where the system architecture could be i386 + // but the GCC target architecture could be (say) i686. + // FIXME: It may be worthwhile to generalize this and look for a second + // triple. + {"i386-linux-gnu/gcc/" + CandidateTriple.str(), "../../..", + (TargetArch == llvm::Triple::x86 && + TargetTriple.getOS() != llvm::Triple::Solaris)}}; for (auto &Suffix : Suffixes) { if (!Suffix.Active) diff --git a/clang/test/Driver/gcc-toolchain.cpp b/clang/test/Driver/gcc-toolchain.cpp index 03a7991d6c70..cddf9b1bdbca 100644 --- a/clang/test/Driver/gcc-toolchain.cpp +++ b/clang/test/Driver/gcc-toolchain.cpp @@ -1,31 +1,34 @@ // Test that gcc-toolchain option is working correctly // // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \ -// RUN: --target=x86_64-linux-gnu --gcc-toolchain=%S/Inputs/ubuntu_14.04_multiarch_tree/usr | \ -// RUN: FileCheck %s +// RUN: --target=i386-unknown-linux -stdlib=libstdc++ \ +// RUN: --gcc-toolchain=%S/Inputs/ubuntu_11.04_multiarch_tree/usr \ +// RUN: --sysroot="" \ +// RUN: | FileCheck %s // // Additionally check that the legacy spelling of the flag works. // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \ -// RUN: --target=x86_64-linux-gnu -gcc-toolchain %S/Inputs/ubuntu_14.04_multiarch_tree/usr | \ -// RUN: FileCheck %s +// RUN: --target=i386-unknown-linux -stdlib=libstdc++ \ +// RUN: -gcc-toolchain %S/Inputs/ubuntu_11.04_multiarch_tree/usr \ +// RUN: --sysroot="" \ +// RUN: | FileCheck %s // // Test for header search toolchain detection. // CHECK: "-internal-isystem" -// CHECK: "[[TOOLCHAIN:[^"]+]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8" +// CHECK: "[[TOOLCHAIN:[^"]+]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5" // CHECK: "-internal-isystem" -// CHECK: "[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8" +// CHECK: "[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/i686-linux-gnu" // CHECK: "-internal-isystem" -// CHECK: "[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/backward" +// CHECK: "[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/backward" // CHECK: "-internal-isystem" "/usr/local/include" // // Test for linker toolchain detection. Note that only the '-L' flags will use // the same precise formatting of the path as the '-internal-system' flags // above, so we just blanket wildcard match the 'crtbegin.o'. // CHECK: "{{[^"]*}}ld{{(.exe)?}}" -// CHECK-SAME: "{{[^"]*}}/usr/lib/gcc/x86_64-linux-gnu/4.8{{/|\\\\}}crtbegin.o" -// CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8" -/// On x86_64, there is an extra usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu but we should not test it. -// CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.." +// CHECK: "{{[^"]*}}/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5{{/|\\\\}}crtbegin.o" +// CHECK: "-L[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5" +// CHECK: "-L[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../.." /// Test we don't detect GCC installation under -B. 
// RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \ diff --git a/clang/test/Driver/linux-header-search.cpp b/clang/test/Driver/linux-header-search.cpp index 4aed02f9c15d..8c1fc99d79f3 100644 --- a/clang/test/Driver/linux-header-search.cpp +++ b/clang/test/Driver/linux-header-search.cpp @@ -67,6 +67,23 @@ // CHECK-BASIC-LIBSTDCXX-LIBCXXV2-SYSROOT: "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v2" // CHECK-BASIC-LIBSTDCXX-LIBCXXV2-SYSROOT: "-internal-isystem" "[[SYSROOT]]/usr/local/include" // +// Test a very broken version of multiarch that shipped in Ubuntu 11.04. +// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \ +// RUN: -target i386-unknown-linux -stdlib=libstdc++ \ +// RUN: --sysroot=%S/Inputs/ubuntu_11.04_multiarch_tree \ +// RUN: --gcc-toolchain="" \ +// RUN: | FileCheck --check-prefix=CHECK-UBUNTU-11-04 %s +// CHECK-UBUNTU-11-04: "{{.*}}clang{{.*}}" "-cc1" +// CHECK-UBUNTU-11-04: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]" +// CHECK-UBUNTU-11-04: "-isysroot" "[[SYSROOT:[^"]+]]" +// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5" +// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/i686-linux-gnu" +// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/backward" +// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/local/include" +// CHECK-UBUNTU-11-04: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include" +// CHECK-UBUNTU-11-04: "-internal-externc-isystem" "[[SYSROOT]]/include" +// CHECK-UBUNTU-11-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include" +// // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \ // RUN: -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \ // RUN: --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \ diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c index 8ba57a941443..1aa955737438 100644 --- a/clang/test/Driver/linux-ld.c +++ b/clang/test/Driver/linux-ld.c @@ -565,6 +565,21 @@ // CHECK-BASIC-LIBCXX-C-LINK: "--sysroot=[[SYSROOT]]" // CHECK-BASIC-LIBCXX-C-LINK: "-L[[SYSROOT]]/usr/bin/../lib" // +// Test a very broken version of multiarch that shipped in Ubuntu 11.04. +// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ +// RUN: --target=i386-unknown-linux -rtlib=platform \ +// RUN: --gcc-toolchain="" \ +// RUN: --sysroot=%S/Inputs/ubuntu_11.04_multiarch_tree \ +// RUN: | FileCheck --check-prefix=CHECK-UBUNTU-11-04 %s +// CHECK-UBUNTU-11-04: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" +// CHECK-UBUNTU-11-04: "{{.*}}/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5{{/|\\\\}}crtbegin.o" +// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5" +// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../i386-linux-gnu" +// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu" +// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../.." +// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/lib" +// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib" +// // Check multi arch support on Ubuntu 12.04 LTS. 
// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: --target=arm-unknown-linux-gnueabihf -rtlib=platform \ -- GitLab From 5cbe2279f723f1cca1a542d95e7d9760e4f52240 Mon Sep 17 00:00:00 2001 From: David Zarzycki Date: Sat, 20 Mar 2021 07:52:08 -0400 Subject: [PATCH 0253/1000] [lit] Sort testing summary output As fallout from from the record-and-reorder work, people asked that the summary output be sorted to aid diffing. --- llvm/utils/lit/lit/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/lit/lit/main.py b/llvm/utils/lit/lit/main.py index c108c0015653..70a31110f796 100755 --- a/llvm/utils/lit/lit/main.py +++ b/llvm/utils/lit/lit/main.py @@ -295,7 +295,7 @@ def print_results(tests, elapsed, opts): tests_by_code[test.result.code].append(test) for code in lit.Test.ResultCode.all_codes(): - print_group(tests_by_code[code], code, opts.shown_codes) + print_group(sorted(tests_by_code[code], key=lambda t: t.getFullName()), code, opts.shown_codes) print_summary(tests_by_code, opts.quiet, elapsed) -- GitLab From f860187ea6e9b30e1ecf74784f0af0e0c9ecc01c Mon Sep 17 00:00:00 2001 From: Vaivaswatha Nagaraj Date: Fri, 19 Mar 2021 19:35:13 +0530 Subject: [PATCH 0254/1000] [OCaml] Add (get/set)_module_identifer functions Also: - Fix a bug that crept in when fixing a buildbot failure in https://github.com/llvm/llvm-project/commit/f7be9db6220cb39f0eaa12d2af3abedf0d86c303 - Use mlsize_t for cstr_to_string as that is what caml_alloc_string specifies. Differential Revision: https://reviews.llvm.org/D98851 --- llvm/bindings/ocaml/llvm/llvm.ml | 7 +++++++ llvm/bindings/ocaml/llvm/llvm.mli | 8 ++++++++ llvm/bindings/ocaml/llvm/llvm_ocaml.c | 17 +++++++++++++++-- llvm/bindings/ocaml/llvm/llvm_ocaml.h | 2 +- llvm/test/Bindings/OCaml/core.ml | 4 ++++ 5 files changed, 35 insertions(+), 3 deletions(-) diff --git a/llvm/bindings/ocaml/llvm/llvm.ml b/llvm/bindings/ocaml/llvm/llvm.ml index 243f872fe029..9e55ea8c4364 100644 --- a/llvm/bindings/ocaml/llvm/llvm.ml +++ b/llvm/bindings/ocaml/llvm/llvm.ml @@ -442,6 +442,13 @@ external string_of_llmodule : llmodule -> string = "llvm_string_of_llmodule" external set_module_inline_asm : llmodule -> string -> unit = "llvm_set_module_inline_asm" external module_context : llmodule -> llcontext = "LLVMGetModuleContext" + +external get_module_identifier : llmodule -> string + = "llvm_get_module_identifier" + +external set_module_identifer : llmodule -> string -> unit + = "llvm_set_module_identifier" + external get_module_flag : llmodule -> string -> llmetadata option = "llvm_get_module_flag" external add_module_flag : llmodule -> ModuleFlagBehavior.t -> diff --git a/llvm/bindings/ocaml/llvm/llvm.mli b/llvm/bindings/ocaml/llvm/llvm.mli index d65260dc7d0f..c191382aee22 100644 --- a/llvm/bindings/ocaml/llvm/llvm.mli +++ b/llvm/bindings/ocaml/llvm/llvm.mli @@ -543,6 +543,14 @@ val set_module_inline_asm : llmodule -> string -> unit See the method [llvm::Module::getContext] *) val module_context : llmodule -> llcontext +(** [get_module_identifier m] returns the module identifier of the + specified module. See the method [llvm::Module::getModuleIdentifier] *) +val get_module_identifier : llmodule -> string + +(** [set_module_identifier m id] sets the module identifier of [m] + to [id]. 
See the method [llvm::Module::setModuleIdentifier] *) +val set_module_identifer : llmodule -> string -> unit + (** [get_module_flag m k] Return the corresponding value if key [k] appears in the module flags of [m], otherwise return None See the method [llvm::Module::getModuleFlag] *) diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c index 104635bb6c3a..04f9796baf0c 100644 --- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c +++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c @@ -44,12 +44,12 @@ CAMLprim value ptr_to_option(void *Ptr) { CAMLreturn(Option); } -CAMLprim value cstr_to_string(const unsigned char *Str, unsigned Len) { +CAMLprim value cstr_to_string(const unsigned char *Str, mlsize_t Len) { CAMLparam0(); CAMLlocal1(String); if (Str) { String = caml_alloc_string(Len); - memcpy(String_val(Str), Str, Len); + memcpy(String_val(String), Str, Len); } else { String = caml_alloc_string(0); } @@ -335,6 +335,19 @@ CAMLprim value llvm_string_of_llmodule(LLVMModuleRef M) { CAMLreturn(ModuleStr); } +/* llmodule -> string */ +CAMLprim value llvm_get_module_identifier(LLVMModuleRef M) { + size_t Len; + const char *Name = LLVMGetModuleIdentifier(M, &Len); + return cstr_to_string(Name, (mlsize_t)Len); +} + +/* llmodule -> string -> unit */ +CAMLprim value llvm_set_module_identifier(LLVMModuleRef M, value Id) { + LLVMSetModuleIdentifier(M, String_val(Id), caml_string_length(Id)); + return Val_unit; +} + /* llmodule -> string -> unit */ CAMLprim value llvm_set_module_inline_asm(LLVMModuleRef M, value Asm) { LLVMSetModuleInlineAsm(M, String_val(Asm)); diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.h b/llvm/bindings/ocaml/llvm/llvm_ocaml.h index 0b39b4730360..c52f7ed63650 100644 --- a/llvm/bindings/ocaml/llvm/llvm_ocaml.h +++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.h @@ -25,6 +25,6 @@ CAMLprim value ptr_to_option(void *Ptr); /* Convert a C string into an OCaml string */ -CAMLprim value cstr_to_string(const unsigned char *Str, unsigned Len); +CAMLprim value cstr_to_string(const unsigned char *Str, mlsize_t Len); #endif // LLVM_LLVM_OCAML_H diff --git a/llvm/test/Bindings/OCaml/core.ml b/llvm/test/Bindings/OCaml/core.ml index e1bb6b056142..532171a1842c 100644 --- a/llvm/test/Bindings/OCaml/core.ml +++ b/llvm/test/Bindings/OCaml/core.ml @@ -596,6 +596,10 @@ let test_global_variables () = begin group "iteration"; let m = create_module context "temp" in + insist (get_module_identifier m = "temp"); + set_module_identifer m "temp2"; + insist (get_module_identifier m = "temp2"); + insist (At_end m = global_begin m); insist (At_start m = global_end m); -- GitLab From dc3b438c8f34a54ba9648c97a02764319bd1aca8 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 20 Mar 2021 09:57:05 -0700 Subject: [PATCH 0255/1000] Revert "Revert "[Driver] Drop obsoleted Ubuntu 11.04 gcc detection"" This reverts commit 243333ef3ec6c1e3910eb442177c2e2e927e6a87. 
--- clang/lib/Driver/ToolChains/Gnu.cpp | 11 +--------- clang/test/Driver/gcc-toolchain.cpp | 25 ++++++++++------------- clang/test/Driver/linux-header-search.cpp | 17 --------------- clang/test/Driver/linux-ld.c | 15 -------------- 4 files changed, 12 insertions(+), 56 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 3c1fc87d7896..3491a29a5f9c 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -2506,7 +2506,6 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple( const llvm::Triple &TargetTriple, const ArgList &Args, const std::string &LibDir, StringRef CandidateTriple, bool NeedsBiarchSuffix, bool GCCDirExists, bool GCCCrossDirExists) { - llvm::Triple::ArchType TargetArch = TargetTriple.getArch(); // Locations relative to the system lib directory where GCC's triple-specific // directories might reside. struct GCCLibSuffix { @@ -2530,15 +2529,7 @@ void Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple( // files in that location, not just GCC installation data. {CandidateTriple.str(), "..", TargetTriple.getVendor() == llvm::Triple::Freescale || - TargetTriple.getVendor() == llvm::Triple::OpenEmbedded}, - - // Deal with cases (on Ubuntu) where the system architecture could be i386 - // but the GCC target architecture could be (say) i686. - // FIXME: It may be worthwhile to generalize this and look for a second - // triple. - {"i386-linux-gnu/gcc/" + CandidateTriple.str(), "../../..", - (TargetArch == llvm::Triple::x86 && - TargetTriple.getOS() != llvm::Triple::Solaris)}}; + TargetTriple.getVendor() == llvm::Triple::OpenEmbedded}}; for (auto &Suffix : Suffixes) { if (!Suffix.Active) diff --git a/clang/test/Driver/gcc-toolchain.cpp b/clang/test/Driver/gcc-toolchain.cpp index cddf9b1bdbca..03a7991d6c70 100644 --- a/clang/test/Driver/gcc-toolchain.cpp +++ b/clang/test/Driver/gcc-toolchain.cpp @@ -1,34 +1,31 @@ // Test that gcc-toolchain option is working correctly // // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \ -// RUN: --target=i386-unknown-linux -stdlib=libstdc++ \ -// RUN: --gcc-toolchain=%S/Inputs/ubuntu_11.04_multiarch_tree/usr \ -// RUN: --sysroot="" \ -// RUN: | FileCheck %s +// RUN: --target=x86_64-linux-gnu --gcc-toolchain=%S/Inputs/ubuntu_14.04_multiarch_tree/usr | \ +// RUN: FileCheck %s // // Additionally check that the legacy spelling of the flag works. // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \ -// RUN: --target=i386-unknown-linux -stdlib=libstdc++ \ -// RUN: -gcc-toolchain %S/Inputs/ubuntu_11.04_multiarch_tree/usr \ -// RUN: --sysroot="" \ -// RUN: | FileCheck %s +// RUN: --target=x86_64-linux-gnu -gcc-toolchain %S/Inputs/ubuntu_14.04_multiarch_tree/usr | \ +// RUN: FileCheck %s // // Test for header search toolchain detection. 
// CHECK: "-internal-isystem" -// CHECK: "[[TOOLCHAIN:[^"]+]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5" +// CHECK: "[[TOOLCHAIN:[^"]+]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8" // CHECK: "-internal-isystem" -// CHECK: "[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/i686-linux-gnu" +// CHECK: "[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8" // CHECK: "-internal-isystem" -// CHECK: "[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/backward" +// CHECK: "[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/backward" // CHECK: "-internal-isystem" "/usr/local/include" // // Test for linker toolchain detection. Note that only the '-L' flags will use // the same precise formatting of the path as the '-internal-system' flags // above, so we just blanket wildcard match the 'crtbegin.o'. // CHECK: "{{[^"]*}}ld{{(.exe)?}}" -// CHECK: "{{[^"]*}}/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5{{/|\\\\}}crtbegin.o" -// CHECK: "-L[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5" -// CHECK: "-L[[TOOLCHAIN]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../.." +// CHECK-SAME: "{{[^"]*}}/usr/lib/gcc/x86_64-linux-gnu/4.8{{/|\\\\}}crtbegin.o" +// CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8" +/// On x86_64, there is an extra usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu but we should not test it. +// CHECK-SAME: "-L[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.." /// Test we don't detect GCC installation under -B. // RUN: %clangxx -no-canonical-prefixes %s -### -o %t 2>&1 \ diff --git a/clang/test/Driver/linux-header-search.cpp b/clang/test/Driver/linux-header-search.cpp index 8c1fc99d79f3..4aed02f9c15d 100644 --- a/clang/test/Driver/linux-header-search.cpp +++ b/clang/test/Driver/linux-header-search.cpp @@ -67,23 +67,6 @@ // CHECK-BASIC-LIBSTDCXX-LIBCXXV2-SYSROOT: "-internal-isystem" "[[SYSROOT]]/usr/include/c++/v2" // CHECK-BASIC-LIBSTDCXX-LIBCXXV2-SYSROOT: "-internal-isystem" "[[SYSROOT]]/usr/local/include" // -// Test a very broken version of multiarch that shipped in Ubuntu 11.04. 
-// RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
-// RUN: -target i386-unknown-linux -stdlib=libstdc++ \
-// RUN: --sysroot=%S/Inputs/ubuntu_11.04_multiarch_tree \
-// RUN: --gcc-toolchain="" \
-// RUN: | FileCheck --check-prefix=CHECK-UBUNTU-11-04 %s
-// CHECK-UBUNTU-11-04: "{{.*}}clang{{.*}}" "-cc1"
-// CHECK-UBUNTU-11-04: "-resource-dir" "[[RESOURCE_DIR:[^"]+]]"
-// CHECK-UBUNTU-11-04: "-isysroot" "[[SYSROOT:[^"]+]]"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/i686-linux-gnu"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../../include/c++/4.5/backward"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[SYSROOT]]/usr/local/include"
-// CHECK-UBUNTU-11-04: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include"
-// CHECK-UBUNTU-11-04: "-internal-externc-isystem" "[[SYSROOT]]/include"
-// CHECK-UBUNTU-11-04: "-internal-externc-isystem" "[[SYSROOT]]/usr/include"
-//
 // RUN: %clang -no-canonical-prefixes %s -### -fsyntax-only 2>&1 \
 // RUN: -target x86_64-unknown-linux-gnu -stdlib=libstdc++ \
 // RUN: --sysroot=%S/Inputs/ubuntu_13.04_multiarch_tree \
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index 1aa955737438..8ba57a941443 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -565,21 +565,6 @@
 // CHECK-BASIC-LIBCXX-C-LINK: "--sysroot=[[SYSROOT]]"
 // CHECK-BASIC-LIBCXX-C-LINK: "-L[[SYSROOT]]/usr/bin/../lib"
 //
-// Test a very broken version of multiarch that shipped in Ubuntu 11.04.
-// RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
-// RUN: --target=i386-unknown-linux -rtlib=platform \
-// RUN: --gcc-toolchain="" \
-// RUN: --sysroot=%S/Inputs/ubuntu_11.04_multiarch_tree \
-// RUN: | FileCheck --check-prefix=CHECK-UBUNTU-11-04 %s
-// CHECK-UBUNTU-11-04: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]"
-// CHECK-UBUNTU-11-04: "{{.*}}/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5{{/|\\\\}}crtbegin.o"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../../i386-linux-gnu"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu/gcc/i686-linux-gnu/4.5/../../../.."
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/lib"
-// CHECK-UBUNTU-11-04: "-L[[SYSROOT]]/usr/lib"
-//
 // Check multi arch support on Ubuntu 12.04 LTS.
 // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \
 // RUN: --target=arm-unknown-linux-gnueabihf -rtlib=platform \
-- 
GitLab

From 879760c245c898e759edab1d3318253080d79f6e Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 20 Mar 2021 10:36:51 -0700
Subject: [PATCH 0256/1000] [VE] Fix types of multiclass template arguments in
 TableGen files

They were not properly checked before `[TableGen] Improve handling of
template arguments`.
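For illustration only, a schematic sketch of the class of fix below; the
multiclass and its body are hypothetical, reusing the VE names RM, MEMrri
and ADDRrri purely as placeholders:

  // Previously tolerated: the parameter is declared with an operator type
  // even though the output pattern instantiates it as an instruction.
  //   multiclass LoadPat<SDPatternOperator inst> { ... }
  // With checked template arguments it must name the instruction class:
  multiclass LoadPat<RM inst> {
    def : Pat<(i32 (load ADDRrri:$addr)), (inst MEMrri:$addr)>;
  }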
---
 llvm/lib/Target/VE/VEInstrInfo.td        | 75 ++++++++++++------------
 llvm/lib/Target/VE/VEInstrPatternsVec.td |  8 +--
 2 files changed, 41 insertions(+), 42 deletions(-)

diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index b6862cf7b30d..2f77daae7130 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -793,7 +793,7 @@ multiclass PFCHm<string opcStr, bits<8>opc> {
 let Constraints = "$dest = $sd", DisableEncoding = "$sd", mayStore=1,
     mayLoad = 1, hasSideEffects = 0 in
 multiclass RRCAStgm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
-                    Operand immOp, Operand MEM, Operand ADDR,
+                    Operand immOp, Operand MEM, ComplexPattern ADDR,
                     SDPatternOperator OpNode = null_frag> {
   def r : RRM {
+                  RM torri,
+                  RM torii,
+                  RM tozri,
+                  RM tozii> {
   def : Pat<(i64 (from ADDRrri:$addr)),
             (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (torri MEMrri:$addr),
             sub_i32)>;
@@ -1748,10 +1748,10 @@ defm : EXT64m;
 // anyextload
 multiclass EXT32m {
+                  RM torri,
+                  RM torii,
+                  RM tozri,
+                  RM tozii> {
   def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
   def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
   def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
@@ -1762,10 +1762,10 @@ defm : EXT32m;
 // truncstore
 multiclass TRUNC64m {
+                  RM torri,
+                  RM torii,
+                  RM tozri,
+                  RM tozii> {
   def : Pat<(from i64:$src, ADDRrri:$addr),
             (torri MEMrri:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
   def : Pat<(from i64:$src, ADDRrii:$addr),
@@ -1781,8 +1781,8 @@ defm : TRUNC64m;
 // Atomic loads
 multiclass ATMLDm {
+                 RM torri, RM torii,
+                 RM tozri, RM tozii> {
   def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
   def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
   def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
@@ -1794,9 +1794,9 @@ defm : ATMLDm;
 defm : ATMLDm;
 
 // Optimized atomic loads with sext
-multiclass SXATMLDm {
+multiclass SXATMLDm {
   def : Pat<(i64 (sext_inreg (i64 (anyext (from ADDRrri:$addr))), TY)),
             (i2l (torri MEMrri:$addr))>;
   def : Pat<(i64 (sext_inreg (i64 (anyext (from ADDRrii:$addr))), TY)),
@@ -1807,8 +1807,8 @@ multiclass SXATMLDm;
 }
 multiclass SXATMLD32m {
+                 RM torri, RM torii,
+                 RM tozri, RM tozii> {
   def : Pat<(i64 (sext (from ADDRrri:$addr))),
             (i2l (torri MEMrri:$addr))>;
   def : Pat<(i64 (sext (from ADDRrii:$addr))),
@@ -1824,9 +1824,9 @@ defm : SXATMLDm;
 
 // Optimized atomic loads with zext
-multiclass ZXATMLDm {
+multiclass ZXATMLDm {
   def : Pat<(i64 (and (anyext (from ADDRrri:$addr)), VAL)),
             (i2l (torri MEMrri:$addr))>;
   def : Pat<(i64 (and (anyext (from ADDRrii:$addr)), VAL)),
@@ -1836,9 +1836,9 @@ multiclass ZXATMLDm;
 }
-multiclass ZXATMLD32m {
+multiclass ZXATMLD32m {
   def : Pat<(i64 (zext (from ADDRrri:$addr))),
             (i2l (torri MEMrri:$addr))>;
   def : Pat<(i64 (zext (from ADDRrii:$addr))),
@@ -1857,8 +1857,8 @@ defm : ZXATMLD32m {
+                 RM torri, RM torii,
+                 RM tozri, RM tozii> {
   def : Pat<(from ADDRrri:$addr, ty:$src), (torri MEMrri:$addr, $src)>;
   def : Pat<(from ADDRrii:$addr, ty:$src), (torii MEMrii:$addr, $src)>;
   def : Pat<(from ADDRzri:$addr, ty:$src), (tozri MEMzri:$addr, $src)>;
@@ -1872,10 +1872,10 @@ defm : ATMSTm;
 // Optimized atomic stores with truncate
 multiclass TRATMSTm {
+                  RM torri,
+                  RM torii,
+                  RM tozri,
+                  RM tozii> {
   def : Pat<(from ADDRrri:$addr, (i32 (trunc i64:$src))),
             (torri MEMrri:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
   def : Pat<(from ADDRrii:$addr, (i32 (trunc i64:$src))),
@@ -1929,10 +1929,10 @@ def : Pat<(br bb:$addr), (BRCFLa bb:$addr)>;
 // brcc
 // integer brcc
-multiclass BRCCIm {
+multiclass BRCCIm {
   def : Pat<(brcc CCSIOp:$cond, ty:$l, simm7:$r, bb:$addr),
             (BrOpNode2 (icond2ccSwap $cond), (LO7 $r), $l, bb:$addr)>;
   def : Pat<(brcc CCSIOp:$cond, ty:$l, ty:$r, bb:$addr),
@@ -1947,8 +1947,7 @@ defm : BRCCIm;
 defm : BRCCIm;
 
 // floating point brcc
-multiclass BRCCFm {
+multiclass BRCCFm {
   def : Pat<(brcc cond:$cond, ty:$l, simm7fp:$r, bb:$addr),
             (BrOpNode2 (fcond2ccSwap $cond), (LO7FP $r), $l, bb:$addr)>;
   def : Pat<(brcc cond:$cond, ty:$l, ty:$r, bb:$addr),
diff --git a/llvm/lib/Target/VE/VEInstrPatternsVec.td b/llvm/lib/Target/VE/VEInstrPatternsVec.td
index 0084876f9f1b..dc3c913c918a 100644
--- a/llvm/lib/Target/VE/VEInstrPatternsVec.td
+++ b/llvm/lib/Target/VE/VEInstrPatternsVec.td
@@ -16,7 +16,7 @@
 //===----------------------------------------------------------------------===//
 
 multiclass vbrd_elem32 {
+                       SDNodeXForm ImmCast, OutPatFrag SuperRegCast> {
   // VBRDil
   def : Pat<(v32 (vec_broadcast (s32 ImmOp:$sy), i32:$vl)),
             (VBRDil (ImmCast $sy), i32:$vl)>;
@@ -38,8 +38,8 @@ multiclass vbrd_elem64 {
+                                  OutPatFrag SubRegCast,
+                                  OutPatFrag SuperRegCast> {
   // LVSvi
   def: Pat<(s32 (extractelt v32:$vec, uimm7:$idx)),
            (SubRegCast (LVSvi v32:$vec, (ULO7 $idx)))>;
@@ -73,7 +73,7 @@ multiclass extract_insert_elem64 {
 
 multiclass patterns_elem32 {
+                            OutPatFrag SubRegCast, OutPatFrag SuperRegCast> {
   defm : vbrd_elem32;
   defm : extract_insert_elem32;
 }
-- 
GitLab

From 188405bc192df54fbf048ddd3da071c9fff4d0d1 Mon Sep 17 00:00:00 2001
From: Stephen Kelly
Date: Wed, 17 Mar 2021 23:22:31 +0000
Subject: [PATCH 0258/1000] [AST] Ensure that an empty json file is generated
 if compile errors

Differential Revision: https://reviews.llvm.org/D98827
---
 .../Tooling/DumpTool/ASTSrcLocProcessor.cpp    | 18 ++++++++++--------
 .../lib/Tooling/DumpTool/ASTSrcLocProcessor.h  |  1 +
 clang/lib/Tooling/DumpTool/ClangSrcLocDump.cpp |  8 +++++++-
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.cpp b/clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.cpp
index ff279d9425d8..e7400e958716 100644
--- a/clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.cpp
+++ b/clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.cpp
@@ -79,17 +79,16 @@ llvm::json::Object toJSON(llvm::StringMap const &Obj) {
   return JsonObj;
 }
 
-void WriteJSON(std::string JsonPath,
-               llvm::StringMap<StringRef> const &ClassInheritance,
-               llvm::StringMap<std::vector<StringRef>> const &ClassesInClade,
-               llvm::StringMap<ClassData> const &ClassEntries) {
+void WriteJSON(std::string JsonPath, llvm::json::Object &&ClassInheritance,
+               llvm::json::Object &&ClassesInClade,
+               llvm::json::Object &&ClassEntries) {
   llvm::json::Object JsonObj;
 
   using llvm::json::toJSON;
 
-  JsonObj["classInheritance"] = ::toJSON(ClassInheritance);
-  JsonObj["classesInClade"] = ::toJSON(ClassesInClade);
-  JsonObj["classEntries"] = ::toJSON(ClassEntries);
+  JsonObj["classInheritance"] = std::move(ClassInheritance);
+  JsonObj["classesInClade"] = std::move(ClassesInClade);
+  JsonObj["classEntries"] = std::move(ClassEntries);
 
   std::error_code EC;
   llvm::raw_fd_ostream JsonOut(JsonPath, EC, llvm::sys::fs::F_Text);
@@ -101,9 +100,12 @@ void WriteJSON(std::string JsonPath,
 }
 
 void ASTSrcLocProcessor::generate() {
-  WriteJSON(JsonPath, ClassInheritance, ClassesInClade, ClassEntries);
+  WriteJSON(JsonPath, ::toJSON(ClassInheritance), ::toJSON(ClassesInClade),
+            ::toJSON(ClassEntries));
 }
 
+void ASTSrcLocProcessor::generateEmpty() { WriteJSON(JsonPath, {}, {}, {}); }
+
 std::vector<std::string> CaptureMethods(std::string TypeString,
                                         const clang::CXXRecordDecl *ASTClass,
                                         const MatchFinder::MatchResult &Result) {
diff --git a/clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.h b/clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.h
index 00994758e03c..5d848f48ed54 100644
--- a/clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.h
+++ b/clang/lib/Tooling/DumpTool/ASTSrcLocProcessor.h
@@ -30,6 +30,7 @@ public:
                StringRef File);
 
   void generate();
+  void generateEmpty();
 
 private:
   void run(const ast_matchers::MatchFinder::MatchResult &Result) override;
diff --git a/clang/lib/Tooling/DumpTool/ClangSrcLocDump.cpp b/clang/lib/Tooling/DumpTool/ClangSrcLocDump.cpp
index 06b58c6382ed..8328977178cc 100644
--- a/clang/lib/Tooling/DumpTool/ClangSrcLocDump.cpp
+++ b/clang/lib/Tooling/DumpTool/ClangSrcLocDump.cpp
@@ -48,7 +48,13 @@ class ASTSrcLocGenerationAction : public clang::ASTFrontendAction {
 public:
   ASTSrcLocGenerationAction() : Processor(JsonOutputPath) {}
 
-  ~ASTSrcLocGenerationAction() { Processor.generate(); }
+  void ExecuteAction() override {
+    clang::ASTFrontendAction::ExecuteAction();
+    if (getCompilerInstance().getDiagnostics().getNumErrors() > 0)
+      Processor.generateEmpty();
+    else
+      Processor.generate();
+  }
 
   std::unique_ptr<clang::ASTConsumer>
   CreateASTConsumer(clang::CompilerInstance &Compiler,
-- 
GitLab

From 47fdaa32f97d29ade52232ad8cb16227d195de6a Mon Sep 17 00:00:00 2001
From: Jez Ng
Date: Sat, 20 Mar 2021 01:03:50 -0400
Subject: [PATCH 0259/1000] [lld-macho] Minor touch-up to objc.s

---
 lld/test/MachO/objc.s | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lld/test/MachO/objc.s b/lld/test/MachO/objc.s
index 06f47d2c3b78..dafee74796d0 100644
--- a/lld/test/MachO/objc.s
+++ b/lld/test/MachO/objc.s
@@ -32,7 +32,8 @@
 # NO-OBJC-EMPTY:
 # NO-OBJC-NEXT: SYMBOL TABLE:
 # NO-OBJC-NEXT: g F __TEXT,__text _main
-# NO_OBJC-NEXT: g *ABS* __mh_execute_header
+# NO-OBJC-NEXT: g *ABS* __mh_execute_header
+# NO-OBJC-EMPTY:
 
 #--- has-objc-symbol.s
 .globl _OBJC_CLASS_$_MyObject
-- 
GitLab

From ee8b53815ddf6f6f94ade0068903cd5ae843fafa Mon Sep 17 00:00:00 2001
From: Sanjay Patel
Date: Sat, 20 Mar 2021 14:45:56 -0400
Subject: [PATCH 0260/1000] [BranchProbability] move options for 'likely' and
 'unlikely'

This makes the settings available for use in other passes by housing
them within the Support lib, but NFC otherwise.

See D98898 for the proposed usage in SimplifyCFG (where this change
was originally included).

Differential Revision: https://reviews.llvm.org/D98945
---
 clang/lib/CodeGen/CodeGenFunction.cpp         |  2 +-
 llvm/include/llvm/Support/BranchProbability.h |  4 ++++
 .../Transforms/Scalar/LowerExpectIntrinsic.h  |  3 ---
 llvm/lib/Support/BranchProbability.cpp        | 14 +++++++++++++
 .../Scalar/LowerExpectIntrinsic.cpp           | 20 +------------------
 5 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index a00ae74fa165..18927b46958c 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -42,8 +42,8 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/CRC.h"
-#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 using namespace clang;
 using namespace CodeGen;
diff --git a/llvm/include/llvm/Support/BranchProbability.h b/llvm/include/llvm/Support/BranchProbability.h
index 6c7ad1fe2a52..f977c70221a5 100644
--- a/llvm/include/llvm/Support/BranchProbability.h
+++ b/llvm/include/llvm/Support/BranchProbability.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_SUPPORT_BRANCHPROBABILITY_H
 #define LLVM_SUPPORT_BRANCHPROBABILITY_H
 
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/DataTypes.h"
 #include
 #include
@@ -21,6 +22,9 @@
 
 namespace llvm {
 
+extern cl::opt<uint32_t> LikelyBranchWeight;
+extern cl::opt<uint32_t> UnlikelyBranchWeight;
+
 class raw_ostream;
 
 // This class represents Branch Probability as a non-negative fraction that is
diff --git a/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h b/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h
index 22b2e649e4d4..4e47ff70d557 100644
--- a/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h
+++ b/llvm/include/llvm/Transforms/Scalar/LowerExpectIntrinsic.h
@@ -17,7 +17,6 @@
 
 #include "llvm/IR/Function.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/Support/CommandLine.h"
 
 namespace llvm {
 
@@ -32,8 +31,6 @@ struct LowerExpectIntrinsicPass : PassInfoMixin<LowerExpectIntrinsicPass> {
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &);
 };
 
-extern cl::opt<uint32_t> LikelyBranchWeight;
-extern cl::opt<uint32_t> UnlikelyBranchWeight;
 }
 
 #endif
diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp
index 60d5478a9052..d93d9cffb9f7 100644
--- a/llvm/lib/Support/BranchProbability.cpp
+++ b/llvm/lib/Support/BranchProbability.cpp
@@ -19,6 +19,20 @@
 
 using namespace llvm;
 
+// These default values are chosen to represent an extremely skewed outcome for
+// a condition, but they leave some room for interpretation by later passes.
+//
+// If the documentation for __builtin_expect() was made explicit that it should
+// only be used in extreme cases, we could make this ratio higher. As it stands,
+// programmers may be using __builtin_expect() / llvm.expect to annotate that a
+// branch is only mildly likely or unlikely to be taken.
+cl::opt<uint32_t> llvm::LikelyBranchWeight(
+    "likely-branch-weight", cl::Hidden, cl::init(2000),
+    cl::desc("Weight of the branch likely to be taken (default = 2000)"));
+cl::opt<uint32_t> llvm::UnlikelyBranchWeight(
+    "unlikely-branch-weight", cl::Hidden, cl::init(1),
+    cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
+
 constexpr uint32_t BranchProbability::D;
 
 raw_ostream &BranchProbability::print(raw_ostream &OS) const {
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index da13075dfee2..d862fcfe8ce5 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -24,6 +24,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Scalar.h"
 
@@ -34,25 +35,6 @@ using namespace llvm;
 STATISTIC(ExpectIntrinsicsHandled,
           "Number of 'expect' intrinsic instructions handled");
 
-// These default values are chosen to represent an extremely skewed outcome for
-// a condition, but they leave some room for interpretation by later passes.
-//
-// If the documentation for __builtin_expect() was made explicit that it should
-// only be used in extreme cases, we could make this ratio higher. As it stands,
-// programmers may be using __builtin_expect() / llvm.expect to annotate that a
-// branch is likely or unlikely to be taken.
-//
-// There is a known dependency on this ratio in CodeGenPrepare when transforming
-// 'select' instructions. It may be worthwhile to hoist these values to some
-// shared space, so they can be used directly by other passes.
-
-cl::opt<uint32_t> llvm::LikelyBranchWeight(
-    "likely-branch-weight", cl::Hidden, cl::init(2000),
-    cl::desc("Weight of the branch likely to be taken (default = 2000)"));
-cl::opt<uint32_t> llvm::UnlikelyBranchWeight(
-    "unlikely-branch-weight", cl::Hidden, cl::init(1),
-    cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
-
 static std::tuple<uint32_t, uint32_t>
 getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) {
 
   if (IntrinsicID == Intrinsic::expect) {
-- 
GitLab

From f628ba0b55b117dc68f9cb3be58189c05910660c Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 20 Mar 2021 13:24:49 -0700
Subject: [PATCH 0261/1000] [test] Fix Driver/gcc-toolchain.cpp if
 CLANG_DEFAULT_RTLIB is compiler-rt

---
 clang/test/Driver/gcc-toolchain.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/test/Driver/gcc-toolchain.cpp b/clang/test/Driver/gcc-toolchain.cpp
index 8bb391f19eac..4bd658315a44 100644
--- a/clang/test/Driver/gcc-toolchain.cpp
+++ b/clang/test/Driver/gcc-toolchain.cpp
@@ -1,12 +1,14 @@
 // Test that gcc-toolchain option is working correctly
 //
+/// Without --rtlib=libgcc the driver may pick clang_rt.crtbegin.o if
+/// -DCLANG_DEFAULT_RTLIB=compiler-rt.
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t --target=x86_64-linux-gnu \
-// RUN:   --gcc-toolchain=%S/Inputs/ubuntu_14.04_multiarch_tree/usr -stdlib=libstdc++ 2>&1 | \
+// RUN:   --gcc-toolchain=%S/Inputs/ubuntu_14.04_multiarch_tree/usr -stdlib=libstdc++ --rtlib=libgcc 2>&1 | \
 // RUN:   FileCheck %s
 //
 // Additionally check that the legacy spelling of the flag works.
 // RUN: %clangxx -no-canonical-prefixes %s -### -o %t --target=x86_64-linux-gnu \
-// RUN:   --gcc-toolchain=%S/Inputs/ubuntu_14.04_multiarch_tree/usr -stdlib=libstdc++ 2>&1 | \
+// RUN:   -gcc-toolchain %S/Inputs/ubuntu_14.04_multiarch_tree/usr -stdlib=libstdc++ --rtlib=libgcc 2>&1 | \
 // RUN:   FileCheck %s
 //
 // Test for header search toolchain detection.
--
GitLab


From 14696baaf4c43fe53f738bc292bbe169eed93d5d Mon Sep 17 00:00:00 2001
From: Jinsong Ji
Date: Sat, 20 Mar 2021 03:48:48 +0000
Subject: [PATCH 0262/1000] [AIX] Update rpath for BUILD_SHARED_LIBS

BUILD_SHARED_LIBS builds each LLVM component as a shared library, which
can reduce the size a lot.

Normally, binaries use $ORIGIN/../lib in their rpath to load the component
libraries; unfortunately, $ORIGIN is not supported by AIX ld.

For now we hardcode the build and install lib paths in the rpath to enable
BUILD_SHARED_LIBS builds.

This is understood not to be a perfect solution; we can update it when we
find a better one.

Reviewed By: hubert.reinterpretcast

Differential Revision: https://reviews.llvm.org/D98901
---
 llvm/cmake/modules/AddLLVM.cmake           | 6 ++++++
 llvm/cmake/modules/HandleLLVMOptions.cmake | 5 +++++
 2 files changed, 11 insertions(+)

diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index da1b78b4f530..2f055c779962 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -2105,6 +2105,12 @@ function(llvm_setup_rpath name)
   if (APPLE)
     set(_install_name_dir INSTALL_NAME_DIR "@rpath")
     set(_install_rpath "@loader_path/../lib${LLVM_LIBDIR_SUFFIX}" ${extra_libdir})
+  elseif(${CMAKE_SYSTEM_NAME} MATCHES "AIX" AND BUILD_SHARED_LIBS)
+    # $ORIGIN is not interpreted at link time by aix ld.
+    # Since BUILD_SHARED_LIBS is only recommended for use by developers,
+    # hardcode the rpath to build/install lib dir first in this mode.
+    # FIXME: update this when there is a better solution.
+    set(_install_rpath "${LLVM_LIBRARY_OUTPUT_INTDIR}" "${CMAKE_INSTALL_PREFIX}/lib${LLVM_LIBDIR_SUFFIX}" ${extra_libdir})
   elseif(UNIX)
     set(_install_rpath "\$ORIGIN/../lib${LLVM_LIBDIR_SUFFIX}" ${extra_libdir})
     if(${CMAKE_SYSTEM_NAME} MATCHES "(FreeBSD|DragonFly)")
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index c250a776517d..0c575b6608b0 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -212,6 +212,11 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "AIX")
     append("-bcdtors:mbr"
       CMAKE_EXE_LINKER_FLAGS CMAKE_MODULE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
   endif()
+  if(BUILD_SHARED_LIBS)
+    # See rpath handling in AddLLVM.cmake
+    # FIXME: Remove this warning if this rpath is no longer hardcoded.
+    message(WARNING "Build and install environment path info may be exposed; binaries will also be unrelocatable.")
+  endif()
 endif()

 # Pass -Wl,-z,defs. This makes sure all symbols are defined. Otherwise a DSO
--
GitLab


From 5155dff2784a47583d432d796b7cf47a0bed9f20 Mon Sep 17 00:00:00 2001
From: Andrew Litteken
Date: Thu, 17 Sep 2020 15:43:40 -0500
Subject: [PATCH 0263/1000] [IRSim] Adding basic implementation of llvm-sim.
This is a similarity visualization tool that accepts a Module and passes it to the IRSimilarityIdentifier. The resulting SimilarityGroups are output in a JSON file. Tests are found in test/tools/llvm-sim and check for the file not found, a bad module, and that the JSON is created correctly. Reviewers: paquette, jroelofs, MaskRay Recommit of: 15645d044bcfe2a0f63156048b302f997a717688 to fix linking errors. Differential Revision: https://reviews.llvm.org/D86974 --- llvm/test/CMakeLists.txt | 1 + llvm/test/lit.cfg.py | 2 +- llvm/test/tools/llvm-sim/Inputs/sim1.ll | 27 ++++ llvm/test/tools/llvm-sim/fail-cases.test | 8 + llvm/test/tools/llvm-sim/single-sim-file.test | 57 +++++++ llvm/test/tools/llvm-sim/single-sim.test | 56 +++++++ llvm/tools/llvm-sim/CMakeLists.txt | 9 ++ llvm/tools/llvm-sim/llvm-sim.cpp | 149 ++++++++++++++++++ 8 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 llvm/test/tools/llvm-sim/Inputs/sim1.ll create mode 100644 llvm/test/tools/llvm-sim/fail-cases.test create mode 100644 llvm/test/tools/llvm-sim/single-sim-file.test create mode 100644 llvm/test/tools/llvm-sim/single-sim.test create mode 100644 llvm/tools/llvm-sim/CMakeLists.txt create mode 100644 llvm/tools/llvm-sim/llvm-sim.cpp diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 7c4fa2e9033a..0c72adca931b 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -109,6 +109,7 @@ set(LLVM_TEST_DEPENDS llvm-readelf llvm-reduce llvm-rtdyld + llvm-sim llvm-size llvm-split llvm-strings diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 2a1ccc2dcfbd..244d69e01cfc 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -162,7 +162,7 @@ tools.extend([ 'llvm-link', 'llvm-lto', 'llvm-lto2', 'llvm-mc', 'llvm-mca', 'llvm-modextract', 'llvm-nm', 'llvm-objcopy', 'llvm-objdump', 'llvm-pdbutil', 'llvm-profdata', 'llvm-ranlib', 'llvm-rc', 'llvm-readelf', - 'llvm-readobj', 'llvm-rtdyld', 'llvm-size', 'llvm-split', 'llvm-strings', + 'llvm-readobj', 'llvm-rtdyld', 'llvm-sim', 'llvm-size', 'llvm-split', 'llvm-strings', 'llvm-strip', 'llvm-tblgen', 'llvm-undname', 'llvm-c-test', 'llvm-cxxfilt', 'llvm-xray', 'yaml2obj', 'obj2yaml', 'yaml-bench', 'verify-uselistorder', 'bugpoint', 'llc', 'llvm-symbolizer', 'opt', 'sancov', 'sanstats']) diff --git a/llvm/test/tools/llvm-sim/Inputs/sim1.ll b/llvm/test/tools/llvm-sim/Inputs/sim1.ll new file mode 100644 index 000000000000..facc27d285b0 --- /dev/null +++ b/llvm/test/tools/llvm-sim/Inputs/sim1.ll @@ -0,0 +1,27 @@ +define void @similar_func1() { +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} + +define void @similar_func2() { +entry: + %a = alloca i32, align 4 + %b = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 2, i32* %a, align 4 + store i32 3, i32* %b, align 4 + store i32 4, i32* %c, align 4 + %al = load i32, i32* %a + %bl = load i32, i32* %b + %cl = load i32, i32* %c + ret void +} diff --git a/llvm/test/tools/llvm-sim/fail-cases.test b/llvm/test/tools/llvm-sim/fail-cases.test new file mode 100644 index 000000000000..41e3a5617acb --- /dev/null +++ b/llvm/test/tools/llvm-sim/fail-cases.test @@ -0,0 +1,8 @@ +# RUN: not llvm-sim %s 2>&1 | FileCheck %s +# RUN: not llvm-sim %s.2 2>&1 | FileCheck %s --check-prefix=EXIST + +# File reading error messaging tests. 
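# (Editorial note inferred from the RUN lines above, not part of the original
# patch: the first RUN feeds this .test file itself to llvm-sim, which fails
# LLVM IR parsing with "expected top-level entity"; the second RUN names a
# nonexistent path, %s.2, which exercises the file-not-found diagnostic
# checked under the EXIST prefix.)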
+ +# CHECK: error: expected top-level entity + +# EXIST: error: Could not open input file: No such file or directory diff --git a/llvm/test/tools/llvm-sim/single-sim-file.test b/llvm/test/tools/llvm-sim/single-sim-file.test new file mode 100644 index 000000000000..5e45edf12c2c --- /dev/null +++ b/llvm/test/tools/llvm-sim/single-sim-file.test @@ -0,0 +1,57 @@ +# RUN: llvm-sim -o %t %S/Inputs/sim1.ll +# RUN: FileCheck %s < %t + +# Checking the output of a single module test. + +# CHECK: { +# CHECK-NEXT: "1": [ +# CHECK-NEXT: { +# CHECK-NEXT: "start": 8, +# CHECK-NEXT: "end": 9 +# CHECK-NEXT: }, +# CHECK-NEXT: { +# CHECK-NEXT: "start": 18, +# CHECK-NEXT: "end": 19 +# CHECK-NEXT: } +# CHECK-NEXT: ], +# CHECK-NEXT: "2": [ +# CHECK-NEXT: { +# CHECK-NEXT: "start": 7, +# CHECK-NEXT: "end": 9 +# CHECK-NEXT: }, +# CHECK-NEXT: { +# CHECK-NEXT: "start": 17, +# CHECK-NEXT: "end": 19 +# CHECK-NEXT: } +# CHECK-NEXT: ], +# CHECK-NEXT: "3": [ +# CHECK-NEXT: { +# CHECK-NEXT: "start": 6, +# CHECK-NEXT: "end": 9 +# CHECK-NEXT: }, +# CHECK-NEXT: { +# CHECK-NEXT: "start": 16, +# CHECK-NEXT: "end": 19 +# CHECK-NEXT: } +# CHECK-NEXT: ], +# CHECK-NEXT: "4": [ +# CHECK-NEXT: { +# CHECK-NEXT: "start": 5, +# CHECK-NEXT: "end": 9 +# CHECK-NEXT: }, +# CHECK-NEXT: { +# CHECK-NEXT: "start": 15, +# CHECK-NEXT: "end": 19 +# CHECK-NEXT: } +# CHECK-NEXT: ], +# CHECK-NEXT: "5": [ +# CHECK-NEXT: { +# CHECK-NEXT: "start": 4, +# CHECK-NEXT: "end": 9 +# CHECK-NEXT: }, +# CHECK-NEXT: { +# CHECK-NEXT: "start": 14, +# CHECK-NEXT: "end": 19 +# CHECK-NEXT: } +# CHECK-NEXT: ] +# CHECK-NEXT: } diff --git a/llvm/test/tools/llvm-sim/single-sim.test b/llvm/test/tools/llvm-sim/single-sim.test new file mode 100644 index 000000000000..4e04682e294e --- /dev/null +++ b/llvm/test/tools/llvm-sim/single-sim.test @@ -0,0 +1,56 @@ +# RUN: llvm-sim -o - %S/Inputs/sim1.ll | FileCheck %s + +# Checking the output of a single module test. 
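# (Editorial note, not part of the original patch: each top-level JSON key is
# a similarity-group index, and every entry in a group is a [start, end] range
# over the sequential, module-wide instruction numbering that llvm-sim
# assigns. The two functions in sim1.ll are identical and ten instructions
# long, so each group pairs ranges exactly ten apart, e.g. 8-9 and 18-19.)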
+
+# CHECK: {
+# CHECK-NEXT: "1": [
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 8,
+# CHECK-NEXT: "end": 9
+# CHECK-NEXT: },
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 18,
+# CHECK-NEXT: "end": 19
+# CHECK-NEXT: }
+# CHECK-NEXT: ],
+# CHECK-NEXT: "2": [
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 7,
+# CHECK-NEXT: "end": 9
+# CHECK-NEXT: },
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 17,
+# CHECK-NEXT: "end": 19
+# CHECK-NEXT: }
+# CHECK-NEXT: ],
+# CHECK-NEXT: "3": [
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 6,
+# CHECK-NEXT: "end": 9
+# CHECK-NEXT: },
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 16,
+# CHECK-NEXT: "end": 19
+# CHECK-NEXT: }
+# CHECK-NEXT: ],
+# CHECK-NEXT: "4": [
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 5,
+# CHECK-NEXT: "end": 9
+# CHECK-NEXT: },
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 15,
+# CHECK-NEXT: "end": 19
+# CHECK-NEXT: }
+# CHECK-NEXT: ],
+# CHECK-NEXT: "5": [
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 4,
+# CHECK-NEXT: "end": 9
+# CHECK-NEXT: },
+# CHECK-NEXT: {
+# CHECK-NEXT: "start": 14,
+# CHECK-NEXT: "end": 19
+# CHECK-NEXT: }
+# CHECK-NEXT: ]
+# CHECK-NEXT: }
diff --git a/llvm/tools/llvm-sim/CMakeLists.txt b/llvm/tools/llvm-sim/CMakeLists.txt
new file mode 100644
index 000000000000..76299050392a
--- /dev/null
+++ b/llvm/tools/llvm-sim/CMakeLists.txt
@@ -0,0 +1,9 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  Support
+  Analysis
+  IRReader)
+
+add_llvm_tool(llvm-sim
+  llvm-sim.cpp
+)
diff --git a/llvm/tools/llvm-sim/llvm-sim.cpp b/llvm/tools/llvm-sim/llvm-sim.cpp
new file mode 100644
index 000000000000..26e370ff30f1
--- /dev/null
+++ b/llvm/tools/llvm-sim/llvm-sim.cpp
@@ -0,0 +1,149 @@
+//===-- llvm-sim.cpp - Find similar sections of programs -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This program finds similar sections of a Module, and exports them as a JSON
+// file.
+//
+// To find similarities contained across multiple modules, please use llvm-link
+// first to merge the modules.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/IRSimilarityIdentifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/JSON.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+using namespace llvm;
+using namespace IRSimilarity;
+
+static cl::opt<std::string> OutputFilename("o", cl::desc("Output Filename"),
+                                           cl::init("-"),
+                                           cl::value_desc("filename"));
+
+static cl::opt<std::string> InputSourceFile(cl::Positional,
+                                            cl::desc("<Source file>"),
+                                            cl::init("-"),
+                                            cl::value_desc("filename"));
+
+/// Retrieve the unique number \p I was mapped to in parseBitcodeFile.
+///
+/// \param I - The Instruction to find the instruction number for.
+/// \param LLVMInstNum - The mapping of Instructions to their location in the
+/// module represented by an unsigned integer.
+/// \returns The instruction number for \p I if it exists.
+Optional<unsigned>
+getPositionInModule(const Instruction *I,
+                    const DenseMap<Instruction *, unsigned> &LLVMInstNum) {
+  assert(I && "Instruction is nullptr!");
+  DenseMap<Instruction *, unsigned>::const_iterator It = LLVMInstNum.find(I);
+  if (It == LLVMInstNum.end())
+    return None;
+  return It->second;
+}
+
+/// Exports the given SimilarityGroups to a JSON file at \p FilePath.
+///
+/// \param FilePath - The path to the output location.
+/// \param SimSections - The similarity groups to process.
+/// \param LLVMInstNum - The mapping of Instructions to their location in the
+/// module represented by an unsigned integer.
+/// \returns A nonzero error code if there was a failure creating the file.
+std::error_code
+exportToFile(const StringRef FilePath,
+             const SimilarityGroupList &SimSections,
+             const DenseMap<Instruction *, unsigned> &LLVMInstNum) {
+  std::error_code EC;
+  std::unique_ptr<ToolOutputFile> Out(
+      new ToolOutputFile(FilePath, EC, sys::fs::OF_None));
+  if (EC)
+    return EC;
+
+  json::OStream J(Out->os(), 1);
+  J.objectBegin();
+
+  unsigned SimOption = 1;
+  // Process each list of SimilarityGroups organized by the Module.
+  for (const SimilarityGroup &G : SimSections) {
+    std::string SimOptionStr = std::to_string(SimOption);
+    J.attributeBegin(SimOptionStr);
+    J.arrayBegin();
+    // For each file there is a list of the range where the similarity
+    // exists.
+    for (const IRSimilarityCandidate &C : G) {
+      Optional<unsigned> Start =
+          getPositionInModule((*C.front()).Inst, LLVMInstNum);
+      Optional<unsigned> End =
+          getPositionInModule((*C.back()).Inst, LLVMInstNum);
+
+      assert(Start.hasValue() &&
+             "Could not find instruction number for first instruction");
+      assert(End.hasValue() &&
+             "Could not find instruction number for last instruction");
+
+      J.object([&] {
+        J.attribute("start", Start.getValue());
+        J.attribute("end", End.getValue());
+      });
+    }
+    J.arrayEnd();
+    J.attributeEnd();
+    SimOption++;
+  }
+  J.objectEnd();
+
+  Out->keep();
+
+  return EC;
+}
+
+int main(int argc, const char *argv[]) {
+  InitLLVM X(argc, argv);
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM IR Similarity Visualizer\n");
+
+  LLVMContext CurrContext;
+  SMDiagnostic Err;
+  std::unique_ptr<Module> ModuleToAnalyze =
+      parseIRFile(InputSourceFile, Err, CurrContext);
+
+  if (!ModuleToAnalyze) {
+    Err.print(argv[0], errs());
+    return 1;
+  }
+
+  // Mapping from an Instruction pointer to its occurrence in a sequential
+  // list of all the Instructions in a Module.
+  DenseMap<Instruction *, unsigned> LLVMInstNum;
+
+  // We give each instruction a number, which gives us a start and end value
+  // for the beginning and end of each IRSimilarityCandidate.
+  unsigned InstructionNumber = 1;
+  for (Function &F : *ModuleToAnalyze)
+    for (BasicBlock &BB : F)
+      for (Instruction &I : BB.instructionsWithoutDebug())
+        LLVMInstNum[&I] = InstructionNumber++;
+
+  // The similarity identifier we will use to find the similar sections.
+  IRSimilarityIdentifier SimIdent;
+  SimilarityGroupList SimilaritySections =
+      SimIdent.findSimilarity(*ModuleToAnalyze);
+
+  std::error_code E =
+      exportToFile(OutputFilename, SimilaritySections, LLVMInstNum);
+  if (E) {
+    errs() << argv[0] << ": " << E.message() << '\n';
+    return 2;
+  }
+
+  return 0;
+}
--
GitLab


From b0d8823a8a440549f303f9ba45aaa5550e1dc536 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sat, 20 Mar 2021 12:34:06 -0700
Subject: [PATCH 0264/1000] [RISCV] Add isel pattern to optimize (mul (and X, 0xffffffff), (and Y, 0xffffffff)) on RV64

This pattern computes the full 64 bit product of a 32x32 unsigned multiply.
This requires two pairs of SLLI+SRLI to zero the upper 32 bits of the inputs.

We can do better than this by using two SLLIs to move the lower bits to the
upper bits, and then using MULHU to compute the product. This is the high
half of a full 64x64 product. Since we put 32 zeros in the lower bits of the
inputs, we know the 128-bit product will have zeros in the lower 64 bits.
So the upper 64 bits, which MULHU computes, will contain the original 64 bit product we were after. The same trick would work for (mul (sext_inreg X, i32), (sext_inreg Y, i32)) using MULHS, but sext_inreg is sext.w which is already one instruction so we wouldn't save anything. Differential Revision: https://reviews.llvm.org/D99026 --- llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 9 +++++++++ .../CodeGen/RISCV/rv64i-w-insts-legalization.ll | 12 ++++-------- llvm/test/CodeGen/RISCV/xaluo.ll | 16 ++++------------ 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index 2bfdc9312ebb..d38b5a98b31c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -93,4 +93,13 @@ def : Pat<(and (riscv_remuw (assertzexti32 GPR:$rs1), // produce a result where res[63:32]=0 and res[31]=1. def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))), (REMW GPR:$rs1, GPR:$rs2)>; + +// Special case for calculating the full 64-bit product of a 32x32 unsigned +// multiply where the inputs aren't known to be zero extended. We can shift the +// inputs left by 32 and use a MULHU. This saves two SRLIs needed to finish +// zeroing the upper 32 bits. +// TODO: If one of the operands is zero extended and the other isn't, we might +// still be better off shifting both left by 32. +def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))), + (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; } // Predicates = [HasStdExtM, IsRV64] diff --git a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll index c4a4de7681b0..682f351478ed 100644 --- a/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll +++ b/llvm/test/CodeGen/RISCV/rv64i-w-insts-legalization.ll @@ -10,13 +10,11 @@ define signext i32 @addw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin ; CHECK-NEXT: add a2, a2, a1 ; CHECK-NEXT: addi a3, a0, 1 ; CHECK-NEXT: mul a3, a2, a3 -; CHECK-NEXT: slli a2, a2, 32 -; CHECK-NEXT: srli a2, a2, 32 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: addi a1, a1, -2 ; CHECK-NEXT: slli a1, a1, 32 -; CHECK-NEXT: srli a1, a1, 32 -; CHECK-NEXT: mul a1, a2, a1 +; CHECK-NEXT: slli a2, a2, 32 +; CHECK-NEXT: mulhu a1, a2, a1 ; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: add a0, a3, a0 ; CHECK-NEXT: addw a0, a0, a1 @@ -57,13 +55,11 @@ define signext i32 @subw(i32 signext %s, i32 signext %n, i32 signext %k) nounwin ; CHECK-NEXT: not a2, a0 ; CHECK-NEXT: add a3, a2, a1 ; CHECK-NEXT: mul a2, a3, a2 -; CHECK-NEXT: slli a3, a3, 32 -; CHECK-NEXT: srli a3, a3, 32 ; CHECK-NEXT: sub a1, a1, a0 ; CHECK-NEXT: addi a1, a1, -2 ; CHECK-NEXT: slli a1, a1, 32 -; CHECK-NEXT: srli a1, a1, 32 -; CHECK-NEXT: mul a1, a3, a1 +; CHECK-NEXT: slli a3, a3, 32 +; CHECK-NEXT: mulhu a1, a3, a1 ; CHECK-NEXT: srli a1, a1, 1 ; CHECK-NEXT: sub a0, a2, a0 ; CHECK-NEXT: subw a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index e29cfc9156bc..facc0f2914b1 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -556,10 +556,8 @@ define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) { ; RV64-LABEL: umulo.i32: ; RV64: # %bb.0: # %entry ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: mul a1, a0, a1 +; RV64-NEXT: mulhu a1, a0, a1 ; RV64-NEXT: srli a0, a1, 32 ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a1, 
0(a2) @@ -1297,10 +1295,8 @@ define i32 @umulo.select.i32(i32 %v1, i32 %v2) { ; RV64-LABEL: umulo.select.i32: ; RV64: # %bb.0: # %entry ; RV64-NEXT: slli a2, a1, 32 -; RV64-NEXT: srli a2, a2, 32 ; RV64-NEXT: slli a3, a0, 32 -; RV64-NEXT: srli a3, a3, 32 -; RV64-NEXT: mul a2, a3, a2 +; RV64-NEXT: mulhu a2, a3, a2 ; RV64-NEXT: srli a2, a2, 32 ; RV64-NEXT: bnez a2, .LBB42_2 ; RV64-NEXT: # %bb.1: # %entry @@ -1324,10 +1320,8 @@ define i1 @umulo.not.i32(i32 %v1, i32 %v2) { ; RV64-LABEL: umulo.not.i32: ; RV64: # %bb.0: # %entry ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: mulhu a0, a0, a1 ; RV64-NEXT: srli a0, a0, 32 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret @@ -1893,10 +1887,8 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { ; RV64-LABEL: umulo.br.i32: ; RV64: # %bb.0: # %entry ; RV64-NEXT: slli a1, a1, 32 -; RV64-NEXT: srli a1, a1, 32 ; RV64-NEXT: slli a0, a0, 32 -; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: mulhu a0, a0, a1 ; RV64-NEXT: srli a0, a0, 32 ; RV64-NEXT: beqz a0, .LBB57_2 ; RV64-NEXT: # %bb.1: # %overflow -- GitLab From 1fe1e996e987426e5d6352dabef358fc4ae619e5 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 20 Mar 2021 15:24:02 -0700 Subject: [PATCH 0265/1000] [test] Delete "-internal-isystem" "/usr/local/include" --- clang/test/Driver/gcc-toolchain.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/test/Driver/gcc-toolchain.cpp b/clang/test/Driver/gcc-toolchain.cpp index 4bd658315a44..fa256bec2b9a 100644 --- a/clang/test/Driver/gcc-toolchain.cpp +++ b/clang/test/Driver/gcc-toolchain.cpp @@ -18,7 +18,6 @@ // CHECK: "[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/x86_64-linux-gnu/c++/4.8" // CHECK: "-internal-isystem" // CHECK: "[[TOOLCHAIN]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../include/c++/4.8/backward" -// CHECK: "-internal-isystem" "/usr/local/include" // // Test for linker toolchain detection. Note that only the '-L' flags will use // the same precise formatting of the path as the '-internal-system' flags -- GitLab From 0874281d6054d8f5645bb066271b6f73acde7e80 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 20 Mar 2021 15:09:15 -0700 Subject: [PATCH 0266/1000] [RISCV] Add Zba command lines to xaluo.ll. NFC Some of the patterns end up with 32 to 64 bit zero extends on RV64 which can be handled by zext.w. --- llvm/test/CodeGen/RISCV/xaluo.ll | 1461 ++++++++++++++++++++++++++++++ 1 file changed, 1461 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index facc0f2914b1..758cf4c41801 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=riscv32 -mattr=+m -verify-machineinstrs | FileCheck %s --check-prefix=RV32 ; RUN: llc < %s -mtriple=riscv64 -mattr=+m -verify-machineinstrs | FileCheck %s --check-prefix=RV64 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+experimental-zba -verify-machineinstrs | FileCheck %s --check-prefix=RV32ZBA +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+experimental-zba -verify-machineinstrs | FileCheck %s --check-prefix=RV64ZBA ; ; Get the actual value of the overflow bit. 
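; (Editorial sketch, not part of the original tests: D99026 relies on the
; identity (x & 0xffffffff) * (y & 0xffffffff) == mulhu(x << 32, y << 32),
; because shifting both inputs left by 32 places the 64-bit product of the
; low halves entirely in the upper 64 bits of the 128-bit result, which is
; exactly the half that mulhu returns. For the Zba runs added here, note that
; zext.w rd, rs is an alias for add.uw rd, rs, zero, so the 32-to-64-bit zero
; extends in these patterns take a single instruction instead of slli+srli.)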
@@ -25,6 +27,26 @@ define zeroext i1 @saddo1.i32(i32 %v1, i32 %v2, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a3, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo1.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a3, a0, a1 +; RV32ZBA-NEXT: slt a0, a3, a0 +; RV32ZBA-NEXT: slti a1, a1, 0 +; RV32ZBA-NEXT: xor a0, a1, a0 +; RV32ZBA-NEXT: sw a3, 0(a2) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo1.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: add a3, a0, a1 +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: xor a0, a0, a3 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a3, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -51,6 +73,23 @@ define zeroext i1 @saddo2.i32(i32 %v1, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a2, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo2.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a2, a0, 4 +; RV32ZBA-NEXT: slt a0, a2, a0 +; RV32ZBA-NEXT: sw a2, 0(a1) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo2.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: addi a2, a0, 4 +; RV64ZBA-NEXT: addiw a0, a0, 4 +; RV64ZBA-NEXT: xor a0, a0, a2 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a2, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 4) %val = extractvalue {i32, i1} %t, 0 @@ -78,6 +117,24 @@ define zeroext i1 @saddo3.i32(i32 %v1, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a2, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo3.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a2, a0, -4 +; RV32ZBA-NEXT: slt a0, a2, a0 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: sw a2, 0(a1) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo3.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: addi a2, a0, -4 +; RV64ZBA-NEXT: addiw a0, a0, -4 +; RV64ZBA-NEXT: xor a0, a0, a2 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a2, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 -4) %val = extractvalue {i32, i1} %t, 0 @@ -108,6 +165,27 @@ define zeroext i1 @saddo4.i32(i32 %v1, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a3, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo4.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: lui a2, 4096 +; RV32ZBA-NEXT: addi a2, a2, -1 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: slt a0, a2, a0 +; RV32ZBA-NEXT: sw a2, 0(a1) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo4.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: lui a2, 4096 +; RV64ZBA-NEXT: addiw a2, a2, -1 +; RV64ZBA-NEXT: add a3, a0, a2 +; RV64ZBA-NEXT: addw a0, a0, a2 +; RV64ZBA-NEXT: xor a0, a0, a3 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a3, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 16777215) %val = extractvalue {i32, i1} %t, 0 @@ -140,6 +218,30 @@ define zeroext i1 @saddo1.i64(i64 %v1, i64 %v2, i64* %res) { ; RV64-NEXT: xor a0, a1, a0 ; RV64-NEXT: sd a3, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo1.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a5, a1, a3 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a5, a5, a0 +; RV32ZBA-NEXT: xor a0, a1, a5 +; RV32ZBA-NEXT: xor a1, a1, a3 +; RV32ZBA-NEXT: not a1, a1 +; RV32ZBA-NEXT: and a0, a1, a0 +; RV32ZBA-NEXT: slti 
a0, a0, 0 +; RV32ZBA-NEXT: sw a2, 0(a4) +; RV32ZBA-NEXT: sw a5, 4(a4) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo1.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a3, a0, a1 +; RV64ZBA-NEXT: slt a0, a3, a0 +; RV64ZBA-NEXT: slti a1, a1, 0 +; RV64ZBA-NEXT: xor a0, a1, a0 +; RV64ZBA-NEXT: sd a3, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -168,6 +270,26 @@ define zeroext i1 @saddo2.i64(i64 %v1, i64* %res) { ; RV64-NEXT: slt a0, a2, a0 ; RV64-NEXT: sd a2, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo2.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a3, a0, 4 +; RV32ZBA-NEXT: sltu a0, a3, a0 +; RV32ZBA-NEXT: add a4, a1, a0 +; RV32ZBA-NEXT: xor a0, a1, a4 +; RV32ZBA-NEXT: not a1, a1 +; RV32ZBA-NEXT: and a0, a1, a0 +; RV32ZBA-NEXT: slti a0, a0, 0 +; RV32ZBA-NEXT: sw a3, 0(a2) +; RV32ZBA-NEXT: sw a4, 4(a2) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo2.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a2, a0, 4 +; RV64ZBA-NEXT: slt a0, a2, a0 +; RV64ZBA-NEXT: sd a2, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 4) %val = extractvalue {i64, i1} %t, 0 @@ -197,6 +319,27 @@ define zeroext i1 @saddo3.i64(i64 %v1, i64* %res) { ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: sd a2, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo3.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a3, a0, -4 +; RV32ZBA-NEXT: sltu a0, a3, a0 +; RV32ZBA-NEXT: add a0, a1, a0 +; RV32ZBA-NEXT: addi a4, a0, -1 +; RV32ZBA-NEXT: xor a0, a1, a4 +; RV32ZBA-NEXT: and a0, a1, a0 +; RV32ZBA-NEXT: slti a0, a0, 0 +; RV32ZBA-NEXT: sw a3, 0(a2) +; RV32ZBA-NEXT: sw a4, 4(a2) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo3.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a2, a0, -4 +; RV64ZBA-NEXT: slt a0, a2, a0 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: sd a2, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -4) %val = extractvalue {i64, i1} %t, 0 @@ -222,6 +365,23 @@ define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) { ; RV64-NEXT: sw a0, 0(a2) ; RV64-NEXT: mv a0, a3 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: uaddo.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: sltu a0, a1, a0 +; RV32ZBA-NEXT: sw a1, 0(a2) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addw a3, a0, a1 +; RV64ZBA-NEXT: sext.w a4, a0 +; RV64ZBA-NEXT: sltu a3, a3, a4 +; RV64ZBA-NEXT: add a0, a0, a1 +; RV64ZBA-NEXT: sw a0, 0(a2) +; RV64ZBA-NEXT: mv a0, a3 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -251,6 +411,27 @@ define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV64-NEXT: sltu a0, a1, a0 ; RV64-NEXT: sd a1, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: uaddo.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a3, a1, a3 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a3, a3, a0 +; RV32ZBA-NEXT: beq a3, a1, .LBB8_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: sltu a0, a3, a1 +; RV32ZBA-NEXT: .LBB8_2: # %entry +; RV32ZBA-NEXT: sw a2, 0(a4) +; RV32ZBA-NEXT: sw a3, 4(a4) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a1, a0, a1 +; RV64ZBA-NEXT: sltu a0, a1, a0 +; RV64ZBA-NEXT: sd a1, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} 
@llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -279,6 +460,26 @@ define zeroext i1 @ssubo1.i32(i32 %v1, i32 %v2, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a3, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssubo1.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sgtz a3, a1 +; RV32ZBA-NEXT: sub a1, a0, a1 +; RV32ZBA-NEXT: slt a0, a1, a0 +; RV32ZBA-NEXT: xor a0, a3, a0 +; RV32ZBA-NEXT: sw a1, 0(a2) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssubo1.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: sub a3, a0, a1 +; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: xor a0, a0, a3 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a3, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -304,6 +505,23 @@ define zeroext i1 @ssubo2.i32(i32 %v1, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a2, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssubo2.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a2, a0, 4 +; RV32ZBA-NEXT: slt a0, a2, a0 +; RV32ZBA-NEXT: sw a2, 0(a1) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssubo2.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: addi a2, a0, 4 +; RV64ZBA-NEXT: addiw a0, a0, 4 +; RV64ZBA-NEXT: xor a0, a0, a2 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a2, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 -4) %val = extractvalue {i32, i1} %t, 0 @@ -336,6 +554,30 @@ define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV64-NEXT: xor a0, a3, a0 ; RV64-NEXT: sd a1, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssubo.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sltu a6, a0, a2 +; RV32ZBA-NEXT: sub a5, a1, a3 +; RV32ZBA-NEXT: sub a5, a5, a6 +; RV32ZBA-NEXT: xor a6, a1, a5 +; RV32ZBA-NEXT: xor a1, a1, a3 +; RV32ZBA-NEXT: and a1, a1, a6 +; RV32ZBA-NEXT: slti a1, a1, 0 +; RV32ZBA-NEXT: sub a0, a0, a2 +; RV32ZBA-NEXT: sw a0, 0(a4) +; RV32ZBA-NEXT: sw a5, 4(a4) +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssubo.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sgtz a3, a1 +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: slt a0, a1, a0 +; RV64ZBA-NEXT: xor a0, a3, a0 +; RV64ZBA-NEXT: sd a1, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -361,6 +603,23 @@ define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) { ; RV64-NEXT: sw a0, 0(a2) ; RV64-NEXT: mv a0, a3 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: usubo.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sub a1, a0, a1 +; RV32ZBA-NEXT: sltu a0, a0, a1 +; RV32ZBA-NEXT: sw a1, 0(a2) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: usubo.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: subw a3, a0, a1 +; RV64ZBA-NEXT: sext.w a4, a0 +; RV64ZBA-NEXT: sltu a3, a4, a3 +; RV64ZBA-NEXT: sub a0, a0, a1 +; RV64ZBA-NEXT: sw a0, 0(a2) +; RV64ZBA-NEXT: mv a0, a3 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -393,6 +652,30 @@ define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: sd a1, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: usubo.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sltu a5, a0, a2 +; RV32ZBA-NEXT: sub a3, a1, a3 +; RV32ZBA-NEXT: sub a3, a3, a5 +; RV32ZBA-NEXT: sub a2, a0, 
a2 +; RV32ZBA-NEXT: beq a3, a1, .LBB13_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: sltu a0, a1, a3 +; RV32ZBA-NEXT: j .LBB13_3 +; RV32ZBA-NEXT: .LBB13_2: +; RV32ZBA-NEXT: sltu a0, a0, a2 +; RV32ZBA-NEXT: .LBB13_3: # %entry +; RV32ZBA-NEXT: sw a2, 0(a4) +; RV32ZBA-NEXT: sw a3, 4(a4) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: usubo.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: sltu a0, a0, a1 +; RV64ZBA-NEXT: sd a1, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -422,6 +705,27 @@ define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a3, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mulh a3, a0, a1 +; RV32ZBA-NEXT: mul a1, a0, a1 +; RV32ZBA-NEXT: srai a0, a1, 31 +; RV32ZBA-NEXT: xor a0, a3, a0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: sw a1, 0(a2) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: mul a3, a0, a1 +; RV64ZBA-NEXT: mulw a0, a0, a1 +; RV64ZBA-NEXT: xor a0, a0, a3 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a3, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -452,6 +756,28 @@ define zeroext i1 @smulo2.i32(i32 %v1, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a3, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo2.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a2, zero, 13 +; RV32ZBA-NEXT: mulh a3, a0, a2 +; RV32ZBA-NEXT: mul a2, a0, a2 +; RV32ZBA-NEXT: srai a0, a2, 31 +; RV32ZBA-NEXT: xor a0, a3, a0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: sw a2, 0(a1) +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo2.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: addi a2, zero, 13 +; RV64ZBA-NEXT: mul a3, a0, a2 +; RV64ZBA-NEXT: mulw a0, a0, a2 +; RV64ZBA-NEXT: xor a0, a0, a3 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a3, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 13) %val = extractvalue {i32, i1} %t, 0 @@ -492,6 +818,38 @@ define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sd a1, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi sp, sp, -16 +; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 +; RV32ZBA-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: .cfi_offset ra, -4 +; RV32ZBA-NEXT: .cfi_offset s0, -8 +; RV32ZBA-NEXT: mv s0, a4 +; RV32ZBA-NEXT: sw zero, 4(sp) +; RV32ZBA-NEXT: addi a4, sp, 4 +; RV32ZBA-NEXT: call __mulodi4@plt +; RV32ZBA-NEXT: lw a2, 4(sp) +; RV32ZBA-NEXT: snez a2, a2 +; RV32ZBA-NEXT: sw a1, 4(s0) +; RV32ZBA-NEXT: sw a0, 0(s0) +; RV32ZBA-NEXT: mv a0, a2 +; RV32ZBA-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: addi sp, sp, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulh a3, a0, a1 +; RV64ZBA-NEXT: mul a1, a0, a1 +; RV64ZBA-NEXT: srai a0, a1, 63 +; RV64ZBA-NEXT: xor a0, a3, a0 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sd a1, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %val = 
extractvalue {i64, i1} %t, 0 @@ -535,6 +893,41 @@ define zeroext i1 @smulo2.i64(i64 %v1, i64* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sd a2, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo2.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi sp, sp, -16 +; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 +; RV32ZBA-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: .cfi_offset ra, -4 +; RV32ZBA-NEXT: .cfi_offset s0, -8 +; RV32ZBA-NEXT: mv s0, a2 +; RV32ZBA-NEXT: sw zero, 4(sp) +; RV32ZBA-NEXT: addi a2, zero, 13 +; RV32ZBA-NEXT: addi a4, sp, 4 +; RV32ZBA-NEXT: mv a3, zero +; RV32ZBA-NEXT: call __mulodi4@plt +; RV32ZBA-NEXT: lw a2, 4(sp) +; RV32ZBA-NEXT: snez a2, a2 +; RV32ZBA-NEXT: sw a1, 4(s0) +; RV32ZBA-NEXT: sw a0, 0(s0) +; RV32ZBA-NEXT: mv a0, a2 +; RV32ZBA-NEXT: lw s0, 8(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: addi sp, sp, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo2.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a2, zero, 13 +; RV64ZBA-NEXT: mulh a3, a0, a2 +; RV64ZBA-NEXT: mul a2, a0, a2 +; RV64ZBA-NEXT: srai a0, a2, 63 +; RV64ZBA-NEXT: xor a0, a3, a0 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sd a2, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 13) %val = extractvalue {i64, i1} %t, 0 @@ -562,6 +955,25 @@ define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a1, 0(a2) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mulhu a3, a0, a1 +; RV32ZBA-NEXT: snez a3, a3 +; RV32ZBA-NEXT: mul a0, a0, a1 +; RV32ZBA-NEXT: sw a0, 0(a2) +; RV32ZBA-NEXT: mv a0, a3 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: slli a1, a1, 32 +; RV64ZBA-NEXT: slli a0, a0, 32 +; RV64ZBA-NEXT: mulhu a1, a0, a1 +; RV64ZBA-NEXT: srli a0, a1, 32 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a1, 0(a2) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -591,6 +1003,26 @@ define zeroext i1 @umulo2.i32(i32 %v1, i32* %res) { ; RV64-NEXT: snez a0, a0 ; RV64-NEXT: sw a2, 0(a1) ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo2.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a3, zero, 13 +; RV32ZBA-NEXT: mulhu a2, a0, a3 +; RV32ZBA-NEXT: snez a2, a2 +; RV32ZBA-NEXT: mul a0, a0, a3 +; RV32ZBA-NEXT: sw a0, 0(a1) +; RV32ZBA-NEXT: mv a0, a2 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo2.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: zext.w a0, a0 +; RV64ZBA-NEXT: addi a2, zero, 13 +; RV64ZBA-NEXT: mul a2, a0, a2 +; RV64ZBA-NEXT: srli a0, a2, 32 +; RV64ZBA-NEXT: snez a0, a0 +; RV64ZBA-NEXT: sw a2, 0(a1) +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 13) %val = extractvalue {i32, i1} %t, 0 @@ -632,6 +1064,39 @@ define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV64-NEXT: sd a0, 0(a2) ; RV64-NEXT: mv a0, a3 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mul a6, a3, a0 +; RV32ZBA-NEXT: mul a5, a1, a2 +; RV32ZBA-NEXT: add a6, a5, a6 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: add a6, a5, a6 +; RV32ZBA-NEXT: sltu a7, a6, a5 +; RV32ZBA-NEXT: snez t0, a3 +; RV32ZBA-NEXT: snez a5, a1 +; RV32ZBA-NEXT: and a5, a5, t0 +; RV32ZBA-NEXT: mulhu a1, a1, a2 +; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: or a1, a5, a1 
+; RV32ZBA-NEXT: mulhu a3, a3, a0 +; RV32ZBA-NEXT: snez a3, a3 +; RV32ZBA-NEXT: or a1, a1, a3 +; RV32ZBA-NEXT: or a1, a1, a7 +; RV32ZBA-NEXT: mul a0, a0, a2 +; RV32ZBA-NEXT: sw a0, 0(a4) +; RV32ZBA-NEXT: sw a6, 4(a4) +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulhu a3, a0, a1 +; RV64ZBA-NEXT: snez a3, a3 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: sd a0, 0(a2) +; RV64ZBA-NEXT: mv a0, a3 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -666,6 +1131,32 @@ define zeroext i1 @umulo2.i64(i64 %v1, i64* %res) { ; RV64-NEXT: sd a0, 0(a1) ; RV64-NEXT: mv a0, a2 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo2.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi a3, zero, 13 +; RV32ZBA-NEXT: mul a4, a1, a3 +; RV32ZBA-NEXT: mulhu a5, a0, a3 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: sltu a5, a4, a5 +; RV32ZBA-NEXT: mulhu a1, a1, a3 +; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: or a1, a1, a5 +; RV32ZBA-NEXT: mul a0, a0, a3 +; RV32ZBA-NEXT: sw a0, 0(a2) +; RV32ZBA-NEXT: sw a4, 4(a2) +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo2.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a3, zero, 13 +; RV64ZBA-NEXT: mulhu a2, a0, a3 +; RV64ZBA-NEXT: snez a2, a2 +; RV64ZBA-NEXT: mul a0, a0, a3 +; RV64ZBA-NEXT: sd a0, 0(a1) +; RV64ZBA-NEXT: mv a0, a2 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 13) %val = extractvalue {i64, i1} %t, 0 @@ -701,6 +1192,29 @@ define i32 @saddo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB22_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo.select.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a2, a0, a1 +; RV32ZBA-NEXT: slt a2, a2, a0 +; RV32ZBA-NEXT: slti a3, a1, 0 +; RV32ZBA-NEXT: bne a3, a2, .LBB22_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: .LBB22_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a2, a1 +; RV64ZBA-NEXT: sext.w a3, a0 +; RV64ZBA-NEXT: add a4, a3, a2 +; RV64ZBA-NEXT: addw a2, a3, a2 +; RV64ZBA-NEXT: bne a2, a4, .LBB22_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB22_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -727,6 +1241,25 @@ define i1 @saddo.not.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: xor a0, a0, a2 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo.not.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a2, a0, a1 +; RV32ZBA-NEXT: slt a0, a2, a0 +; RV32ZBA-NEXT: slti a1, a1, 0 +; RV32ZBA-NEXT: xor a0, a1, a0 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: add a2, a0, a1 +; RV64ZBA-NEXT: addw a0, a0, a1 +; RV64ZBA-NEXT: xor a0, a0, a2 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -762,6 +1295,34 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB24_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo.select.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a4, a1, a3 +; RV32ZBA-NEXT: 
add a5, a0, a2 +; RV32ZBA-NEXT: sltu a5, a5, a0 +; RV32ZBA-NEXT: add a4, a4, a5 +; RV32ZBA-NEXT: xor a4, a1, a4 +; RV32ZBA-NEXT: xor a5, a1, a3 +; RV32ZBA-NEXT: not a5, a5 +; RV32ZBA-NEXT: and a4, a5, a4 +; RV32ZBA-NEXT: bltz a4, .LBB24_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a2 +; RV32ZBA-NEXT: mv a1, a3 +; RV32ZBA-NEXT: .LBB24_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a2, a0, a1 +; RV64ZBA-NEXT: slt a2, a2, a0 +; RV64ZBA-NEXT: slti a3, a1, 0 +; RV64ZBA-NEXT: bne a3, a2, .LBB24_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB24_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -792,6 +1353,29 @@ define i1 @saddo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: xor a0, a1, a0 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo.not.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a4, a1, a3 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a0, a4, a0 +; RV32ZBA-NEXT: xor a0, a1, a0 +; RV32ZBA-NEXT: xor a1, a1, a3 +; RV32ZBA-NEXT: not a1, a1 +; RV32ZBA-NEXT: and a0, a1, a0 +; RV32ZBA-NEXT: addi a1, zero, -1 +; RV32ZBA-NEXT: slt a0, a1, a0 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a2, a0, a1 +; RV64ZBA-NEXT: slt a0, a2, a0 +; RV64ZBA-NEXT: slti a1, a1, 0 +; RV64ZBA-NEXT: xor a0, a1, a0 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -818,6 +1402,25 @@ define i32 @uaddo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB26_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: uaddo.select.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a2, a0, a1 +; RV32ZBA-NEXT: bltu a2, a0, .LBB26_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: .LBB26_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addw a2, a0, a1 +; RV64ZBA-NEXT: sext.w a3, a0 +; RV64ZBA-NEXT: bltu a2, a3, .LBB26_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB26_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -840,6 +1443,21 @@ define i1 @uaddo.not.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: sltu a0, a1, a0 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: uaddo.not.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: sltu a0, a1, a0 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addw a1, a0, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: sltu a0, a1, a0 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -875,6 +1493,34 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB28_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: uaddo.select.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a5, a1, a3 +; RV32ZBA-NEXT: add a4, a0, a2 +; RV32ZBA-NEXT: sltu a4, a4, a0 +; RV32ZBA-NEXT: add a5, a5, a4 +; RV32ZBA-NEXT: bne a5, a1, 
.LBB28_3 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: beqz a4, .LBB28_4 +; RV32ZBA-NEXT: .LBB28_2: # %entry +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB28_3: # %entry +; RV32ZBA-NEXT: sltu a4, a5, a1 +; RV32ZBA-NEXT: bnez a4, .LBB28_2 +; RV32ZBA-NEXT: .LBB28_4: # %entry +; RV32ZBA-NEXT: mv a0, a2 +; RV32ZBA-NEXT: mv a1, a3 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a2, a0, a1 +; RV64ZBA-NEXT: bltu a2, a0, .LBB28_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB28_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -902,6 +1548,26 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: sltu a0, a1, a0 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: uaddo.not.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a3, a1, a3 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a2, a3, a0 +; RV32ZBA-NEXT: beq a2, a1, .LBB29_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: sltu a0, a2, a1 +; RV32ZBA-NEXT: .LBB29_2: # %entry +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a1, a0, a1 +; RV64ZBA-NEXT: sltu a0, a1, a0 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -932,6 +1598,29 @@ define i32 @ssubo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB30_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssubo.select.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sgtz a2, a1 +; RV32ZBA-NEXT: sub a3, a0, a1 +; RV32ZBA-NEXT: slt a3, a3, a0 +; RV32ZBA-NEXT: bne a2, a3, .LBB30_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: .LBB30_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssubo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a2, a1 +; RV64ZBA-NEXT: sext.w a3, a0 +; RV64ZBA-NEXT: sub a4, a3, a2 +; RV64ZBA-NEXT: subw a2, a3, a2 +; RV64ZBA-NEXT: bne a2, a4, .LBB30_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB30_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -958,6 +1647,25 @@ define i1 @ssubo.not.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: xor a0, a0, a2 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssubo.not.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sgtz a2, a1 +; RV32ZBA-NEXT: sub a1, a0, a1 +; RV32ZBA-NEXT: slt a0, a1, a0 +; RV32ZBA-NEXT: xor a0, a2, a0 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssubo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: sub a2, a0, a1 +; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: xor a0, a0, a2 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -991,6 +1699,32 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB32_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssubo.select.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sltu a4, a0, a2 +; RV32ZBA-NEXT: sub a5, a1, a3 +; RV32ZBA-NEXT: sub a4, a5, a4 +; 
RV32ZBA-NEXT: xor a4, a1, a4 +; RV32ZBA-NEXT: xor a5, a1, a3 +; RV32ZBA-NEXT: and a4, a5, a4 +; RV32ZBA-NEXT: bltz a4, .LBB32_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a2 +; RV32ZBA-NEXT: mv a1, a3 +; RV32ZBA-NEXT: .LBB32_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssubo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sgtz a2, a1 +; RV64ZBA-NEXT: sub a3, a0, a1 +; RV64ZBA-NEXT: slt a3, a3, a0 +; RV64ZBA-NEXT: bne a2, a3, .LBB32_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB32_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -1019,6 +1753,27 @@ define i1 @ssub.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: xor a0, a2, a0 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssub.not.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sltu a0, a0, a2 +; RV32ZBA-NEXT: sub a2, a1, a3 +; RV32ZBA-NEXT: sub a0, a2, a0 +; RV32ZBA-NEXT: xor a0, a1, a0 +; RV32ZBA-NEXT: xor a1, a1, a3 +; RV32ZBA-NEXT: and a0, a1, a0 +; RV32ZBA-NEXT: addi a1, zero, -1 +; RV32ZBA-NEXT: slt a0, a1, a0 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssub.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sgtz a2, a1 +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: slt a0, a1, a0 +; RV64ZBA-NEXT: xor a0, a2, a0 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -1045,6 +1800,25 @@ define i32 @usubo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB34_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: usubo.select.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sub a2, a0, a1 +; RV32ZBA-NEXT: bltu a0, a2, .LBB34_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: .LBB34_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: usubo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: subw a2, a0, a1 +; RV64ZBA-NEXT: sext.w a3, a0 +; RV64ZBA-NEXT: bltu a3, a2, .LBB34_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB34_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -1067,6 +1841,21 @@ define i1 @usubo.not.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: usubo.not.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sub a1, a0, a1 +; RV32ZBA-NEXT: sltu a0, a0, a1 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: usubo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: subw a1, a0, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: sltu a0, a0, a1 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -1103,6 +1892,35 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB36_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: usubo.select.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sltu a4, a0, a2 +; RV32ZBA-NEXT: sub a5, a1, a3 +; RV32ZBA-NEXT: sub a4, a5, a4 +; RV32ZBA-NEXT: beq a4, a1, .LBB36_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: sltu a4, a1, a4 +; RV32ZBA-NEXT: beqz a4, .LBB36_3 +; RV32ZBA-NEXT: j .LBB36_4 +; RV32ZBA-NEXT: .LBB36_2: +; RV32ZBA-NEXT: sub a4, a0, a2 +; 
RV32ZBA-NEXT: sltu a4, a0, a4 +; RV32ZBA-NEXT: bnez a4, .LBB36_4 +; RV32ZBA-NEXT: .LBB36_3: # %entry +; RV32ZBA-NEXT: mv a0, a2 +; RV32ZBA-NEXT: mv a1, a3 +; RV32ZBA-NEXT: .LBB36_4: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: usubo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sub a2, a0, a1 +; RV64ZBA-NEXT: bltu a0, a2, .LBB36_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB36_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -1133,6 +1951,29 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: sltu a0, a0, a1 ; RV64-NEXT: xori a0, a0, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: usubo.not.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sltu a4, a0, a2 +; RV32ZBA-NEXT: sub a3, a1, a3 +; RV32ZBA-NEXT: sub a3, a3, a4 +; RV32ZBA-NEXT: beq a3, a1, .LBB37_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: sltu a0, a1, a3 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB37_2: +; RV32ZBA-NEXT: sub a1, a0, a2 +; RV32ZBA-NEXT: sltu a0, a0, a1 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: usubo.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: sltu a0, a0, a1 +; RV64ZBA-NEXT: xori a0, a0, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -1163,6 +2004,29 @@ define i32 @smulo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB38_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo.select.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mulh a2, a0, a1 +; RV32ZBA-NEXT: mul a3, a0, a1 +; RV32ZBA-NEXT: srai a3, a3, 31 +; RV32ZBA-NEXT: bne a2, a3, .LBB38_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: .LBB38_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a2, a1 +; RV64ZBA-NEXT: sext.w a3, a0 +; RV64ZBA-NEXT: mul a4, a3, a2 +; RV64ZBA-NEXT: mulw a2, a3, a2 +; RV64ZBA-NEXT: bne a2, a4, .LBB38_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB38_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -1189,6 +2053,25 @@ define i1 @smulo.not.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: xor a0, a0, a2 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo.not.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mulh a2, a0, a1 +; RV32ZBA-NEXT: mul a0, a0, a1 +; RV32ZBA-NEXT: srai a0, a0, 31 +; RV32ZBA-NEXT: xor a0, a2, a0 +; RV32ZBA-NEXT: seqz a0, a0 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: mul a2, a0, a1 +; RV64ZBA-NEXT: mulw a0, a0, a1 +; RV64ZBA-NEXT: xor a0, a0, a2 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -1244,6 +2127,54 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB40_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo.select.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi sp, sp, -32 +; RV32ZBA-NEXT: .cfi_def_cfa_offset 32 +; RV32ZBA-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s0, 
24(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: sw s3, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: .cfi_offset ra, -4 +; RV32ZBA-NEXT: .cfi_offset s0, -8 +; RV32ZBA-NEXT: .cfi_offset s1, -12 +; RV32ZBA-NEXT: .cfi_offset s2, -16 +; RV32ZBA-NEXT: .cfi_offset s3, -20 +; RV32ZBA-NEXT: mv s2, a3 +; RV32ZBA-NEXT: mv s3, a2 +; RV32ZBA-NEXT: mv s0, a1 +; RV32ZBA-NEXT: mv s1, a0 +; RV32ZBA-NEXT: sw zero, 8(sp) +; RV32ZBA-NEXT: addi a4, sp, 8 +; RV32ZBA-NEXT: call __mulodi4@plt +; RV32ZBA-NEXT: lw a0, 8(sp) +; RV32ZBA-NEXT: bnez a0, .LBB40_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv s1, s3 +; RV32ZBA-NEXT: mv s0, s2 +; RV32ZBA-NEXT: .LBB40_2: # %entry +; RV32ZBA-NEXT: mv a0, s1 +; RV32ZBA-NEXT: mv a1, s0 +; RV32ZBA-NEXT: lw s3, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: addi sp, sp, 32 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulh a2, a0, a1 +; RV64ZBA-NEXT: mul a3, a0, a1 +; RV64ZBA-NEXT: srai a3, a3, 63 +; RV64ZBA-NEXT: bne a2, a3, .LBB40_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB40_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -1275,6 +2206,30 @@ define i1 @smulo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: xor a0, a2, a0 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo.not.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi sp, sp, -16 +; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 +; RV32ZBA-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: .cfi_offset ra, -4 +; RV32ZBA-NEXT: sw zero, 8(sp) +; RV32ZBA-NEXT: addi a4, sp, 8 +; RV32ZBA-NEXT: call __mulodi4@plt +; RV32ZBA-NEXT: lw a0, 8(sp) +; RV32ZBA-NEXT: seqz a0, a0 +; RV32ZBA-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: addi sp, sp, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulh a2, a0, a1 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: srai a0, a0, 63 +; RV64ZBA-NEXT: xor a0, a2, a0 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -1303,6 +2258,27 @@ define i32 @umulo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB42_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo.select.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mulhu a2, a0, a1 +; RV32ZBA-NEXT: bnez a2, .LBB42_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a1 +; RV32ZBA-NEXT: .LBB42_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo.select.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: slli a2, a1, 32 +; RV64ZBA-NEXT: slli a3, a0, 32 +; RV64ZBA-NEXT: mulhu a2, a3, a2 +; RV64ZBA-NEXT: srli a2, a2, 32 +; RV64ZBA-NEXT: bnez a2, .LBB42_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB42_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -1325,6 +2301,21 @@ define i1 @umulo.not.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: srli a0, a0, 32 ; RV64-NEXT: 
seqz a0, a0 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo.not.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mulhu a0, a0, a1 +; RV32ZBA-NEXT: seqz a0, a0 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo.not.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: slli a1, a1, 32 +; RV64ZBA-NEXT: slli a0, a0, 32 +; RV64ZBA-NEXT: mulhu a0, a0, a1 +; RV64ZBA-NEXT: srli a0, a0, 32 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) %obit = extractvalue {i32, i1} %t, 1 @@ -1366,6 +2357,40 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mv a0, a1 ; RV64-NEXT: .LBB44_2: # %entry ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo.select.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mul a4, a3, a0 +; RV32ZBA-NEXT: mul a5, a1, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: sltu a6, a4, a5 +; RV32ZBA-NEXT: snez a5, a3 +; RV32ZBA-NEXT: snez a4, a1 +; RV32ZBA-NEXT: and a4, a4, a5 +; RV32ZBA-NEXT: mulhu a5, a1, a2 +; RV32ZBA-NEXT: snez a5, a5 +; RV32ZBA-NEXT: or a4, a4, a5 +; RV32ZBA-NEXT: mulhu a5, a3, a0 +; RV32ZBA-NEXT: snez a5, a5 +; RV32ZBA-NEXT: or a4, a4, a5 +; RV32ZBA-NEXT: or a4, a4, a6 +; RV32ZBA-NEXT: bnez a4, .LBB44_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: mv a0, a2 +; RV32ZBA-NEXT: mv a1, a3 +; RV32ZBA-NEXT: .LBB44_2: # %entry +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo.select.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulhu a2, a0, a1 +; RV64ZBA-NEXT: bnez a2, .LBB44_2 +; RV64ZBA-NEXT: # %bb.1: # %entry +; RV64ZBA-NEXT: mv a0, a1 +; RV64ZBA-NEXT: .LBB44_2: # %entry +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -1400,6 +2425,33 @@ define i1 @umulo.not.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mulhu a0, a0, a1 ; RV64-NEXT: seqz a0, a0 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo.not.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mul a4, a3, a0 +; RV32ZBA-NEXT: mul a5, a1, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: sltu a6, a4, a5 +; RV32ZBA-NEXT: snez a5, a3 +; RV32ZBA-NEXT: snez a4, a1 +; RV32ZBA-NEXT: and a4, a4, a5 +; RV32ZBA-NEXT: mulhu a1, a1, a2 +; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: or a1, a4, a1 +; RV32ZBA-NEXT: mulhu a0, a3, a0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: or a0, a1, a0 +; RV32ZBA-NEXT: or a0, a0, a6 +; RV32ZBA-NEXT: xori a0, a0, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo.not.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulhu a0, a0, a1 +; RV64ZBA-NEXT: seqz a0, a0 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %obit = extractvalue {i64, i1} %t, 1 @@ -1438,6 +2490,33 @@ define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: .LBB46_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo.br.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a2, a0, a1 +; RV32ZBA-NEXT: slt a0, a2, a0 +; RV32ZBA-NEXT: slti a1, a1, 0 +; RV32ZBA-NEXT: beq a1, a0, .LBB46_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB46_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: add a2, a0, a1 +; RV64ZBA-NEXT: addw a0, 
a0, a1 +; RV64ZBA-NEXT: beq a0, a2, .LBB46_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB46_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -1482,6 +2561,37 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: .LBB47_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: saddo.br.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a4, a1, a3 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a0, a4, a0 +; RV32ZBA-NEXT: xor a0, a1, a0 +; RV32ZBA-NEXT: xor a1, a1, a3 +; RV32ZBA-NEXT: not a1, a1 +; RV32ZBA-NEXT: and a0, a1, a0 +; RV32ZBA-NEXT: bgez a0, .LBB47_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB47_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: saddo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a2, a0, a1 +; RV64ZBA-NEXT: slt a0, a2, a0 +; RV64ZBA-NEXT: slti a1, a1, 0 +; RV64ZBA-NEXT: beq a1, a0, .LBB47_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB47_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -1518,6 +2628,29 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: .LBB48_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: uaddo.br.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a1, a0, a1 +; RV32ZBA-NEXT: bgeu a1, a0, .LBB48_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB48_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addw a1, a0, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB48_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB48_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -1560,6 +2693,35 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: .LBB49_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: uaddo.br.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a3, a1, a3 +; RV32ZBA-NEXT: add a2, a0, a2 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a2, a3, a0 +; RV32ZBA-NEXT: beq a2, a1, .LBB49_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: sltu a0, a2, a1 +; RV32ZBA-NEXT: .LBB49_2: # %entry +; RV32ZBA-NEXT: beqz a0, .LBB49_4 +; RV32ZBA-NEXT: # %bb.3: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB49_4: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: uaddo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a1, a0, a1 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB49_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB49_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 
%v2) %val = extractvalue {i64, i1} %t, 0 @@ -1600,6 +2762,33 @@ define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: .LBB50_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssubo.br.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sgtz a2, a1 +; RV32ZBA-NEXT: sub a1, a0, a1 +; RV32ZBA-NEXT: slt a0, a1, a0 +; RV32ZBA-NEXT: beq a2, a0, .LBB50_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB50_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssubo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: sub a2, a0, a1 +; RV64ZBA-NEXT: subw a0, a0, a1 +; RV64ZBA-NEXT: beq a0, a2, .LBB50_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB50_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -1642,6 +2831,35 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: .LBB51_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: ssubo.br.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sltu a0, a0, a2 +; RV32ZBA-NEXT: sub a2, a1, a3 +; RV32ZBA-NEXT: sub a0, a2, a0 +; RV32ZBA-NEXT: xor a0, a1, a0 +; RV32ZBA-NEXT: xor a1, a1, a3 +; RV32ZBA-NEXT: and a0, a1, a0 +; RV32ZBA-NEXT: bgez a0, .LBB51_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB51_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: ssubo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sgtz a2, a1 +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: slt a0, a1, a0 +; RV64ZBA-NEXT: beq a2, a0, .LBB51_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB51_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -1678,6 +2896,29 @@ define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: .LBB52_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: usubo.br.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sub a1, a0, a1 +; RV32ZBA-NEXT: bgeu a0, a1, .LBB52_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB52_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: usubo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: subw a1, a0, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: bgeu a0, a1, .LBB52_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB52_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -1722,6 +2963,37 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: .LBB53_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: usubo.br.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: sltu a4, a0, a2 +; RV32ZBA-NEXT: sub a3, a1, a3 +; RV32ZBA-NEXT: sub a3, a3, a4 +; RV32ZBA-NEXT: beq a3, a1, .LBB53_3 +; RV32ZBA-NEXT: # %bb.1: # %entry +; 
RV32ZBA-NEXT: sltu a0, a1, a3 +; RV32ZBA-NEXT: bnez a0, .LBB53_4 +; RV32ZBA-NEXT: .LBB53_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB53_3: +; RV32ZBA-NEXT: sub a1, a0, a2 +; RV32ZBA-NEXT: sltu a0, a0, a1 +; RV32ZBA-NEXT: beqz a0, .LBB53_2 +; RV32ZBA-NEXT: .LBB53_4: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: usubo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sub a1, a0, a1 +; RV64ZBA-NEXT: bgeu a0, a1, .LBB53_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB53_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -1762,6 +3034,33 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: .LBB54_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo.br.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mulh a2, a0, a1 +; RV32ZBA-NEXT: mul a0, a0, a1 +; RV32ZBA-NEXT: srai a0, a0, 31 +; RV32ZBA-NEXT: beq a2, a0, .LBB54_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB54_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: sext.w a1, a1 +; RV64ZBA-NEXT: sext.w a0, a0 +; RV64ZBA-NEXT: mul a2, a0, a1 +; RV64ZBA-NEXT: mulw a0, a0, a1 +; RV64ZBA-NEXT: beq a0, a2, .LBB54_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB54_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -1809,6 +3108,40 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: .LBB55_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo.br.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi sp, sp, -16 +; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 +; RV32ZBA-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; RV32ZBA-NEXT: .cfi_offset ra, -4 +; RV32ZBA-NEXT: sw zero, 8(sp) +; RV32ZBA-NEXT: addi a4, sp, 8 +; RV32ZBA-NEXT: call __mulodi4@plt +; RV32ZBA-NEXT: lw a0, 8(sp) +; RV32ZBA-NEXT: beqz a0, .LBB55_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: j .LBB55_3 +; RV32ZBA-NEXT: .LBB55_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: .LBB55_3: # %overflow +; RV32ZBA-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: addi sp, sp, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulh a2, a0, a1 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: srai a0, a0, 63 +; RV64ZBA-NEXT: beq a2, a0, .LBB55_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB55_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -1859,6 +3192,43 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV64-NEXT: .LBB56_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: smulo2.br.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: addi sp, sp, -16 +; RV32ZBA-NEXT: .cfi_def_cfa_offset 16 +; RV32ZBA-NEXT: sw ra, 12(sp) # 4-byte 
Folded Spill +; RV32ZBA-NEXT: .cfi_offset ra, -4 +; RV32ZBA-NEXT: sw zero, 8(sp) +; RV32ZBA-NEXT: addi a2, zero, -13 +; RV32ZBA-NEXT: addi a3, zero, -1 +; RV32ZBA-NEXT: addi a4, sp, 8 +; RV32ZBA-NEXT: call __mulodi4@plt +; RV32ZBA-NEXT: lw a0, 8(sp) +; RV32ZBA-NEXT: beqz a0, .LBB56_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: j .LBB56_3 +; RV32ZBA-NEXT: .LBB56_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: .LBB56_3: # %overflow +; RV32ZBA-NEXT: lw ra, 12(sp) # 4-byte Folded Reload +; RV32ZBA-NEXT: addi sp, sp, 16 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: smulo2.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: addi a1, zero, -13 +; RV64ZBA-NEXT: mulh a2, a0, a1 +; RV64ZBA-NEXT: mul a0, a0, a1 +; RV64ZBA-NEXT: srai a0, a0, 63 +; RV64ZBA-NEXT: beq a2, a0, .LBB56_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB56_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 -13) %val = extractvalue {i64, i1} %t, 0 @@ -1897,6 +3267,31 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: .LBB57_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo.br.i32: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mulhu a0, a0, a1 +; RV32ZBA-NEXT: beqz a0, .LBB57_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB57_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo.br.i32: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: slli a1, a1, 32 +; RV64ZBA-NEXT: slli a0, a0, 32 +; RV64ZBA-NEXT: mulhu a0, a0, a1 +; RV64ZBA-NEXT: srli a0, a0, 32 +; RV64ZBA-NEXT: beqz a0, .LBB57_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB57_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) %val = extractvalue {i32, i1} %t, 0 @@ -1947,6 +3342,43 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: .LBB58_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo.br.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: mul a4, a3, a0 +; RV32ZBA-NEXT: mul a5, a1, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: mulhu a5, a0, a2 +; RV32ZBA-NEXT: add a4, a5, a4 +; RV32ZBA-NEXT: sltu a6, a4, a5 +; RV32ZBA-NEXT: snez a5, a3 +; RV32ZBA-NEXT: snez a4, a1 +; RV32ZBA-NEXT: and a4, a4, a5 +; RV32ZBA-NEXT: mulhu a1, a1, a2 +; RV32ZBA-NEXT: snez a1, a1 +; RV32ZBA-NEXT: or a1, a4, a1 +; RV32ZBA-NEXT: mulhu a0, a3, a0 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: or a0, a1, a0 +; RV32ZBA-NEXT: or a0, a0, a6 +; RV32ZBA-NEXT: beqz a0, .LBB58_2 +; RV32ZBA-NEXT: # %bb.1: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB58_2: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: mulhu a0, a0, a1 +; RV64ZBA-NEXT: beqz a0, .LBB58_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB58_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) %val = extractvalue {i64, i1} %t, 0 @@ -1989,6 +3421,35 @@ define zeroext i1 
@umulo2.br.i64(i64 %v1) { ; RV64-NEXT: .LBB59_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo2.br.i64: +; RV32ZBA: # %bb.0: # %entry +; RV32ZBA-NEXT: add a2, a0, a0 +; RV32ZBA-NEXT: sltu a0, a2, a0 +; RV32ZBA-NEXT: add a2, a1, a1 +; RV32ZBA-NEXT: add a2, a2, a0 +; RV32ZBA-NEXT: beq a2, a1, .LBB59_2 +; RV32ZBA-NEXT: # %bb.1: # %entry +; RV32ZBA-NEXT: sltu a0, a2, a1 +; RV32ZBA-NEXT: .LBB59_2: # %entry +; RV32ZBA-NEXT: beqz a0, .LBB59_4 +; RV32ZBA-NEXT: # %bb.3: # %overflow +; RV32ZBA-NEXT: mv a0, zero +; RV32ZBA-NEXT: ret +; RV32ZBA-NEXT: .LBB59_4: # %continue +; RV32ZBA-NEXT: addi a0, zero, 1 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo2.br.i64: +; RV64ZBA: # %bb.0: # %entry +; RV64ZBA-NEXT: add a1, a0, a0 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB59_2 +; RV64ZBA-NEXT: # %bb.1: # %overflow +; RV64ZBA-NEXT: mv a0, zero +; RV64ZBA-NEXT: ret +; RV64ZBA-NEXT: .LBB59_2: # %continue +; RV64ZBA-NEXT: addi a0, zero, 1 +; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 2) %val = extractvalue {i64, i1} %t, 0 -- GitLab From 07ed62b7d5514937a50b4af4feaa1969911d142e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 20 Mar 2021 15:14:46 -0700 Subject: [PATCH 0267/1000] [RISCV] Disable (mul (and X, 0xffffffff), (and Y, 0xffffffff)) optimization when Zba is enabled. This optimization is trying to save SRLI instructions needed to implement the ANDs. If we have zext.w we won't save anything. Because we don't check that the multiply is the only user of the AND we might even increase instruction count. --- llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 2 ++ llvm/test/CodeGen/RISCV/xaluo.ll | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index d38b5a98b31c..8d5f3e92355a 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -93,7 +93,9 @@ def : Pat<(and (riscv_remuw (assertzexti32 GPR:$rs1), // produce a result where res[63:32]=0 and res[31]=1. def : Pat<(srem (sexti32 (i64 GPR:$rs1)), (sexti32 (i64 GPR:$rs2))), (REMW GPR:$rs1, GPR:$rs2)>; +} // Predicates = [HasStdExtM, IsRV64] +let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in { // Special case for calculating the full 64-bit product of a 32x32 unsigned // multiply where the inputs aren't known to be zero extended. We can shift the // inputs left by 32 and use a MULHU. 
This saves two SRLIs needed to finish diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index 758cf4c41801..f34093e8d6f3 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -967,9 +967,9 @@ define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) { ; ; RV64ZBA-LABEL: umulo.i32: ; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: slli a1, a1, 32 -; RV64ZBA-NEXT: slli a0, a0, 32 -; RV64ZBA-NEXT: mulhu a1, a0, a1 +; RV64ZBA-NEXT: zext.w a1, a1 +; RV64ZBA-NEXT: zext.w a0, a0 +; RV64ZBA-NEXT: mul a1, a0, a1 ; RV64ZBA-NEXT: srli a0, a1, 32 ; RV64ZBA-NEXT: snez a0, a0 ; RV64ZBA-NEXT: sw a1, 0(a2) @@ -2270,9 +2270,9 @@ define i32 @umulo.select.i32(i32 %v1, i32 %v2) { ; ; RV64ZBA-LABEL: umulo.select.i32: ; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: slli a2, a1, 32 -; RV64ZBA-NEXT: slli a3, a0, 32 -; RV64ZBA-NEXT: mulhu a2, a3, a2 +; RV64ZBA-NEXT: zext.w a2, a1 +; RV64ZBA-NEXT: zext.w a3, a0 +; RV64ZBA-NEXT: mul a2, a3, a2 ; RV64ZBA-NEXT: srli a2, a2, 32 ; RV64ZBA-NEXT: bnez a2, .LBB42_2 ; RV64ZBA-NEXT: # %bb.1: # %entry @@ -2310,9 +2310,9 @@ define i1 @umulo.not.i32(i32 %v1, i32 %v2) { ; ; RV64ZBA-LABEL: umulo.not.i32: ; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: slli a1, a1, 32 -; RV64ZBA-NEXT: slli a0, a0, 32 -; RV64ZBA-NEXT: mulhu a0, a0, a1 +; RV64ZBA-NEXT: zext.w a1, a1 +; RV64ZBA-NEXT: zext.w a0, a0 +; RV64ZBA-NEXT: mul a0, a0, a1 ; RV64ZBA-NEXT: srli a0, a0, 32 ; RV64ZBA-NEXT: seqz a0, a0 ; RV64ZBA-NEXT: ret @@ -3281,9 +3281,9 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { ; ; RV64ZBA-LABEL: umulo.br.i32: ; RV64ZBA: # %bb.0: # %entry -; RV64ZBA-NEXT: slli a1, a1, 32 -; RV64ZBA-NEXT: slli a0, a0, 32 -; RV64ZBA-NEXT: mulhu a0, a0, a1 +; RV64ZBA-NEXT: zext.w a1, a1 +; RV64ZBA-NEXT: zext.w a0, a0 +; RV64ZBA-NEXT: mul a0, a0, a1 ; RV64ZBA-NEXT: srli a0, a0, 32 ; RV64ZBA-NEXT: beqz a0, .LBB57_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow -- GitLab From b2bb00377452fd7f7901f1876807095fef340514 Mon Sep 17 00:00:00 2001 From: Jessica Clarke Date: Sat, 20 Mar 2021 22:35:40 +0000 Subject: [PATCH 0268/1000] [RISCV] Update comment in RISCVInstrInfoM.td Missed in 07ed62b7d551. --- llvm/lib/Target/RISCV/RISCVInstrInfoM.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td index 8d5f3e92355a..d6f8287f199c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td @@ -104,4 +104,4 @@ let Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] in { // still be better off shifting both left by 32. def : Pat<(i64 (mul (and GPR:$rs1, 0xffffffff), (and GPR:$rs2, 0xffffffff))), (MULHU (SLLI GPR:$rs1, 32), (SLLI GPR:$rs2, 32))>; -} // Predicates = [HasStdExtM, IsRV64] +} // Predicates = [HasStdExtM, IsRV64, NotHasStdExtZba] -- GitLab From 0776eca7a4e76bfadc311f3607be3a4f0c0e989a Mon Sep 17 00:00:00 2001 From: Andrew Litteken Date: Sat, 20 Mar 2021 18:03:02 -0500 Subject: [PATCH 0269/1000] Revert "[IRSim] Adding basic implementation of llvm-sim." Causing build errors on the Windows Buildbots. This reverts commit 5155dff2784a47583d432d796b7cf47a0bed9f20. 
--- llvm/test/CMakeLists.txt | 1 - llvm/test/lit.cfg.py | 2 +- llvm/test/tools/llvm-sim/Inputs/sim1.ll | 27 ---- llvm/test/tools/llvm-sim/fail-cases.test | 8 - llvm/test/tools/llvm-sim/single-sim-file.test | 57 ------- llvm/test/tools/llvm-sim/single-sim.test | 56 ------- llvm/tools/llvm-sim/CMakeLists.txt | 9 -- llvm/tools/llvm-sim/llvm-sim.cpp | 149 ------------------ 8 files changed, 1 insertion(+), 308 deletions(-) delete mode 100644 llvm/test/tools/llvm-sim/Inputs/sim1.ll delete mode 100644 llvm/test/tools/llvm-sim/fail-cases.test delete mode 100644 llvm/test/tools/llvm-sim/single-sim-file.test delete mode 100644 llvm/test/tools/llvm-sim/single-sim.test delete mode 100644 llvm/tools/llvm-sim/CMakeLists.txt delete mode 100644 llvm/tools/llvm-sim/llvm-sim.cpp diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt index 0c72adca931b..7c4fa2e9033a 100644 --- a/llvm/test/CMakeLists.txt +++ b/llvm/test/CMakeLists.txt @@ -109,7 +109,6 @@ set(LLVM_TEST_DEPENDS llvm-readelf llvm-reduce llvm-rtdyld - llvm-sim llvm-size llvm-split llvm-strings diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py index 244d69e01cfc..2a1ccc2dcfbd 100644 --- a/llvm/test/lit.cfg.py +++ b/llvm/test/lit.cfg.py @@ -162,7 +162,7 @@ tools.extend([ 'llvm-link', 'llvm-lto', 'llvm-lto2', 'llvm-mc', 'llvm-mca', 'llvm-modextract', 'llvm-nm', 'llvm-objcopy', 'llvm-objdump', 'llvm-pdbutil', 'llvm-profdata', 'llvm-ranlib', 'llvm-rc', 'llvm-readelf', - 'llvm-readobj', 'llvm-rtdyld', 'llvm-sim', 'llvm-size', 'llvm-split', 'llvm-strings', + 'llvm-readobj', 'llvm-rtdyld', 'llvm-size', 'llvm-split', 'llvm-strings', 'llvm-strip', 'llvm-tblgen', 'llvm-undname', 'llvm-c-test', 'llvm-cxxfilt', 'llvm-xray', 'yaml2obj', 'obj2yaml', 'yaml-bench', 'verify-uselistorder', 'bugpoint', 'llc', 'llvm-symbolizer', 'opt', 'sancov', 'sanstats']) diff --git a/llvm/test/tools/llvm-sim/Inputs/sim1.ll b/llvm/test/tools/llvm-sim/Inputs/sim1.ll deleted file mode 100644 index facc27d285b0..000000000000 --- a/llvm/test/tools/llvm-sim/Inputs/sim1.ll +++ /dev/null @@ -1,27 +0,0 @@ -define void @similar_func1() { -entry: - %a = alloca i32, align 4 - %b = alloca i32, align 4 - %c = alloca i32, align 4 - store i32 2, i32* %a, align 4 - store i32 3, i32* %b, align 4 - store i32 4, i32* %c, align 4 - %al = load i32, i32* %a - %bl = load i32, i32* %b - %cl = load i32, i32* %c - ret void -} - -define void @similar_func2() { -entry: - %a = alloca i32, align 4 - %b = alloca i32, align 4 - %c = alloca i32, align 4 - store i32 2, i32* %a, align 4 - store i32 3, i32* %b, align 4 - store i32 4, i32* %c, align 4 - %al = load i32, i32* %a - %bl = load i32, i32* %b - %cl = load i32, i32* %c - ret void -} diff --git a/llvm/test/tools/llvm-sim/fail-cases.test b/llvm/test/tools/llvm-sim/fail-cases.test deleted file mode 100644 index 41e3a5617acb..000000000000 --- a/llvm/test/tools/llvm-sim/fail-cases.test +++ /dev/null @@ -1,8 +0,0 @@ -# RUN: not llvm-sim %s 2>&1 | FileCheck %s -# RUN: not llvm-sim %s.2 2>&1 | FileCheck %s --check-prefix=EXIST - -# File reading error messaging tests. - -# CHECK: error: expected top-level entity - -# EXIST: error: Could not open input file: No such file or directory diff --git a/llvm/test/tools/llvm-sim/single-sim-file.test b/llvm/test/tools/llvm-sim/single-sim-file.test deleted file mode 100644 index 5e45edf12c2c..000000000000 --- a/llvm/test/tools/llvm-sim/single-sim-file.test +++ /dev/null @@ -1,57 +0,0 @@ -# RUN: llvm-sim -o %t %S/Inputs/sim1.ll -# RUN: FileCheck %s < %t - -# Checking the output of a single module test. 
- -# CHECK: { -# CHECK-NEXT: "1": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 8, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 18, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ], -# CHECK-NEXT: "2": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 7, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 17, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ], -# CHECK-NEXT: "3": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 6, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 16, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ], -# CHECK-NEXT: "4": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 5, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 15, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ], -# CHECK-NEXT: "5": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 4, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 14, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ] -# CHECK-NEXT: } diff --git a/llvm/test/tools/llvm-sim/single-sim.test b/llvm/test/tools/llvm-sim/single-sim.test deleted file mode 100644 index 4e04682e294e..000000000000 --- a/llvm/test/tools/llvm-sim/single-sim.test +++ /dev/null @@ -1,56 +0,0 @@ -# RUN: llvm-sim -o - %S/Inputs/sim1.ll | FileCheck %s - -# Checking the output of a single module test. - -# CHECK: { -# CHECK-NEXT: "1": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 8, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 18, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ], -# CHECK-NEXT: "2": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 7, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 17, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ], -# CHECK-NEXT: "3": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 6, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 16, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ], -# CHECK-NEXT: "4": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 5, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 15, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ], -# CHECK-NEXT: "5": [ -# CHECK-NEXT: { -# CHECK-NEXT: "start": 4, -# CHECK-NEXT: "end": 9 -# CHECK-NEXT: }, -# CHECK-NEXT: { -# CHECK-NEXT: "start": 14, -# CHECK-NEXT: "end": 19 -# CHECK-NEXT: } -# CHECK-NEXT: ] -# CHECK-NEXT: } diff --git a/llvm/tools/llvm-sim/CMakeLists.txt b/llvm/tools/llvm-sim/CMakeLists.txt deleted file mode 100644 index 76299050392a..000000000000 --- a/llvm/tools/llvm-sim/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -set(LLVM_LINK_COMPONENTS - Core - Support - Analysis - IRReader) - -add_llvm_tool(llvm-sim - llvm-sim.cpp -) diff --git a/llvm/tools/llvm-sim/llvm-sim.cpp b/llvm/tools/llvm-sim/llvm-sim.cpp deleted file mode 100644 index 26e370ff30f1..000000000000 --- a/llvm/tools/llvm-sim/llvm-sim.cpp +++ /dev/null @@ -1,149 +0,0 @@ -//===-- llvm-sim.cpp - Find similar sections of programs -------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This program finds similar sections of a Module, and exports them as a JSON -// file. 
-// -// To find similarities contained across multiple modules, please use llvm-link -// first to merge the modules. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/IRSimilarityIdentifier.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/JSON.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/ToolOutputFile.h" - -using namespace llvm; -using namespace IRSimilarity; - -static cl::opt OutputFilename("o", cl::desc("Output Filename"), - cl::init("-"), - cl::value_desc("filename")); - -static cl::opt InputSourceFile(cl::Positional, - cl::desc(""), - cl::init("-"), - cl::value_desc("filename")); - -/// Retrieve the unique number \p I was mapped to in parseBitcodeFile. -/// -/// \param I - The Instruction to find the instruction number for. -/// \param LLVMInstNum - The mapping of Instructions to their location in the -/// module represented by an unsigned integer. -/// \returns The instruction number for \p I if it exists. -Optional -getPositionInModule(const Instruction *I, - const DenseMap &LLVMInstNum) { - assert(I && "Instruction is nullptr!"); - DenseMap::const_iterator It = LLVMInstNum.find(I); - if (It == LLVMInstNum.end()) - return None; - return It->second; -} - -/// Exports the given SimilarityGroups to a JSON file at \p FilePath. -/// -/// \param FilePath - The path to the output location. -/// \param SimSections - The similarity groups to process. -/// \param LLVMInstNum - The mapping of Instructions to their location in the -/// module represented by an unsigned integer. -/// \returns A nonzero error code if there was a failure creating the file. -std::error_code -exportToFile(const StringRef FilePath, - const SimilarityGroupList &SimSections, - const DenseMap &LLVMInstNum) { - std::error_code EC; - std::unique_ptr Out( - new ToolOutputFile(FilePath, EC, sys::fs::OF_None)); - if (EC) - return EC; - - json::OStream J(Out->os(), 1); - J.objectBegin(); - - unsigned SimOption = 1; - // Process each list of SimilarityGroups organized by the Module. - for (const SimilarityGroup &G : SimSections) { - std::string SimOptionStr = std::to_string(SimOption); - J.attributeBegin(SimOptionStr); - J.arrayBegin(); - // For each file there is a list of the range where the similarity - // exists. - for (const IRSimilarityCandidate &C : G) { - Optional Start = - getPositionInModule((*C.front()).Inst, LLVMInstNum); - Optional End = - getPositionInModule((*C.back()).Inst, LLVMInstNum); - - assert(Start.hasValue() && - "Could not find instruction number for first instruction"); - assert(End.hasValue() && - "Could not find instruction number for last instruction"); - - J.object([&] { - J.attribute("start", Start.getValue()); - J.attribute("end", End.getValue()); - }); - } - J.arrayEnd(); - J.attributeEnd(); - SimOption++; - } - J.objectEnd(); - - Out->keep(); - - return EC; -} - -int main(int argc, const char *argv[]) { - InitLLVM X(argc, argv); - - cl::ParseCommandLineOptions(argc, argv, "LLVM IR Similarity Visualizer\n"); - - LLVMContext CurrContext; - SMDiagnostic Err; - std::unique_ptr ModuleToAnalyze = - parseIRFile(InputSourceFile, Err, CurrContext); - - if (!ModuleToAnalyze) { - Err.print(argv[0], errs()); - return 1; - } - - // Mapping from an Instruction pointer to its occurrence in a sequential - // list of all the Instructions in a Module. 
- DenseMap<const Instruction *, unsigned> LLVMInstNum; - - // We give each instruction a number, which gives us a start and end value - // for the beginning and end of each IRSimilarityCandidate. - unsigned InstructionNumber = 1; - for (Function &F : *ModuleToAnalyze) - for (BasicBlock &BB : F) - for (Instruction &I : BB.instructionsWithoutDebug()) - LLVMInstNum[&I]= InstructionNumber++; - - // The similarity identifier we will use to find the similar sections. - IRSimilarityIdentifier SimIdent; - SimilarityGroupList SimilaritySections = - SimIdent.findSimilarity(*ModuleToAnalyze); - - std::error_code E = - exportToFile(OutputFilename, SimilaritySections, LLVMInstNum); - if (E) { - errs() << argv[0] << ": " << E.message() << '\n'; - return 2; - } - - return 0; -} -- GitLab From 361b7d125b438cda13fa45f13790767a62252be9 Mon Sep 17 00:00:00 2001 From: Chris Lattner Date: Fri, 19 Mar 2021 21:22:15 -0700 Subject: [PATCH 0270/1000] [Canonicalizer] Process regions top-down instead of bottom up & reuse existing constants. This reapplies b5d9a3c / https://reviews.llvm.org/D98609 with a one line fix in processExistingConstants to skip() when erasing a constant we've already seen. Original commit message: 1) Change the canonicalizer to walk the function in top-down order instead of bottom-up order. This composes well with the "top down" nature of constant folding and simplification, reducing iterations and re-evaluation of ops in simple cases. 2) Explicitly enter existing constants into the OperationFolder table before canonicalizing. Previously we would "constant fold" them and rematerialize them, wastefully recreating a bunch of constants, which led to pointless memory traffic. Both changes together provide a 33% speedup for canonicalize on some mid-size CIRCT examples. One artifact of this change is that the constants generated in normal pattern application get inserted at the top of the function as the patterns are applied. Because of this, we get "inverted" constants more often, which is an aesthetic change to the IR but does permute some testcases. Differential Revision: https://reviews.llvm.org/D99006
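[Editor's note: a toy model of the walk-order point in (1) above. The sketch below is plain C++, not MLIR, and is not part of the patch; it simulates a chain of ops in which op i becomes foldable only once op i-1 has folded, and counts how many full passes over the op list each visit order needs to reach a fixpoint. All names in it (passesToFold, topDown, bottomUp) are invented for this illustration.

    #include <cstdio>
    #include <vector>

    // Op 0 starts out as a constant; op i folds once op i-1 has folded.
    // Returns the number of whole passes over `order` until nothing changes
    // (the final pass is the one that observes no change).
    static int passesToFold(const std::vector<int> &order, int n) {
      std::vector<bool> folded(n, false);
      folded[0] = true;
      int passes = 0;
      for (bool changed = true; changed;) {
        changed = false;
        ++passes;
        for (int i : order)
          if (i > 0 && !folded[i] && folded[i - 1]) {
            folded[i] = true;
            changed = true;
          }
      }
      return passes;
    }

    int main() {
      std::vector<int> topDown = {0, 1, 2, 3, 4};
      std::vector<int> bottomUp = {4, 3, 2, 1, 0};
      // Prints "top-down: 2 passes, bottom-up: 5 passes": visiting top-down
      // retires the whole chain in one productive pass, while bottom-up can
      // only retire one link per pass.
      std::printf("top-down: %d passes, bottom-up: %d passes\n",
                  passesToFold(topDown, 5), passesToFold(bottomUp, 5));
    }

The same top-down effect, combined with the constant prepass in (2), is what the 33% figure quoted above is attributed to.]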
--- mlir/include/mlir/Transforms/FoldUtils.h | 6 +- mlir/lib/Transforms/Utils/FoldUtils.cpp | 97 ++++++++++++++++--- .../Utils/GreedyPatternRewriteDriver.cpp | 30 ++++-- .../VectorToSCF/vector-to-loops.mlir | 24 ++--- mlir/test/Dialect/Affine/canonicalize.mlir | 6 +- .../Dialect/Linalg/transform-patterns.mlir | 2 +- mlir/test/Dialect/Vector/canonicalize.mlir | 4 +- mlir/test/Transforms/canonicalize.mlir | 4 +- mlir/test/mlir-tblgen/pattern.mlir | 6 +- 9 files changed, 138 insertions(+), 41 deletions(-) diff --git a/mlir/include/mlir/Transforms/FoldUtils.h b/mlir/include/mlir/Transforms/FoldUtils.h index ad406cb18085..c31ac15eb9c9 100644 --- a/mlir/include/mlir/Transforms/FoldUtils.h +++ b/mlir/include/mlir/Transforms/FoldUtils.h @@ -23,7 +23,6 @@ namespace mlir { class Operation; class Value; - //===--------------------------------------------------------------------===// // OperationFolder //===--------------------------------------------------------------------===// @@ -34,6 +33,11 @@ class OperationFolder { public: OperationFolder(MLIRContext *ctx) : interfaces(ctx) {} + /// Scan the specified region for constants that can be used in folding, + /// moving them to the entry block and adding them to our known-constants + /// table. + void processExistingConstants(Region &region); + /// Tries to perform folding on the given `op`, including unifying /// deduplicated constants. If successful, replaces `op`'s uses with /// folded results, and returns success. `preReplaceAction` is invoked on `op` diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp index 024ae1892861..616a6ef6af57 100644 --- a/mlir/lib/Transforms/Utils/FoldUtils.cpp +++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp @@ -84,6 +84,81 @@ static Operation *materializeConstant(Dialect *dialect, OpBuilder &builder, // OperationFolder //===----------------------------------------------------------------------===// +/// Scan the specified region for constants that can be used in folding, +/// moving them to the entry block and adding them to our known-constants +/// table. +void OperationFolder::processExistingConstants(Region &region) { + if (region.empty()) + return; + + // March the constant insertion point forward, moving all constants to the + // top of the block, but keeping them in their order of discovery. + Region *insertRegion = getInsertionRegion(interfaces, &region.front()); + auto &uniquedConstants = foldScopes[insertRegion]; + + Block &insertBlock = insertRegion->front(); + Block::iterator constantIterator = insertBlock.begin(); + + // Process each constant that we discover in this region. + auto processConstant = [&](Operation *op, Attribute value) { + // Check to see if we already have an instance of this constant. + Operation *&constOp = uniquedConstants[std::make_tuple( + op->getDialect(), value, op->getResult(0).getType())]; + + // If we already have an instance of this constant, CSE/delete this one as + // we go. + if (constOp) { + if (constantIterator == Block::iterator(op)) + ++constantIterator; // Don't invalidate our iterator when scanning. + op->getResult(0).replaceAllUsesWith(constOp->getResult(0)); + op->erase(); + return; + } + + // Otherwise, remember that we have this constant. + constOp = op; + referencedDialects[op].push_back(op->getDialect()); + + // If the constant isn't already at the insertion point then move it up. + if (constantIterator == insertBlock.end() || &*constantIterator != op) + op->moveBefore(&insertBlock, constantIterator); + else + ++constantIterator; // It was pointing at the constant. + }; + + SmallVector<Operation *> isolatedOps; + region.walk([&](Operation *op) { + // If this is a constant, process it. + Attribute value; + if (matchPattern(op, m_Constant(&value))) { + processConstant(op, value); + // We may have deleted the operation, don't check it for regions. + return WalkResult::skip(); + } + + // If the operation has regions and is isolated, don't recurse into it. + if (op->getNumRegions() != 0) { + auto hasDifferentInsertRegion = [&](Region &region) { + return !region.empty() && + getInsertionRegion(interfaces, &region.front()) != insertRegion; + }; + if (llvm::any_of(op->getRegions(), hasDifferentInsertRegion)) { + isolatedOps.push_back(op); + return WalkResult::skip(); + } + } + + // Otherwise keep going. + return WalkResult::advance(); + }); + + // Process regions in any isolated ops separately. + for (Operation *isolated : isolatedOps) { + for (Region &region : isolated->getRegions()) + processExistingConstants(region); + } +} +
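[Editor's note: the "one line fix" relative to the reverted b5d9a3c that the commit message mentions is visible in the hunk above: after processConstant(op, value) runs, the walk returns WalkResult::skip() rather than advancing into the op, because the constant may just have been CSE'd and erased.]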
LogicalResult OperationFolder::tryToFold( Operation *op, function_ref<void(Operation *)> processGeneratedConstants, function_ref<void(Operation *)> preReplaceAction, bool *inPlaceUpdate) { @@ -262,19 +337,19 @@ Operation *OperationFolder::tryGetOrCreateConstant( Attribute value, Type type, Location loc) { // Check if an existing mapping already exists. auto constKey = std::make_tuple(dialect, value, type); - auto *&constInst = uniquedConstants[constKey]; - if (constInst) - return constInst; + auto *&constOp = uniquedConstants[constKey]; + if (constOp) + return constOp; // If one doesn't exist, try to materialize one. - if (!(constInst = materializeConstant(dialect, builder, value, type, loc))) + if (!(constOp = materializeConstant(dialect, builder, value, type, loc))) return nullptr; // Check to see if the generated constant is in the expected dialect. - auto *newDialect = constInst->getDialect(); + auto *newDialect = constOp->getDialect(); if (newDialect == dialect) { - referencedDialects[constInst].push_back(dialect); - return constInst; + referencedDialects[constOp].push_back(dialect); + return constOp; } // If it isn't, then we also need to make sure that the mapping for the new @@ -284,13 +359,13 @@ Operation *OperationFolder::tryGetOrCreateConstant( // If an existing operation in the new dialect already exists, delete the // materialized operation in favor of the existing one. if (auto *existingOp = uniquedConstants.lookup(newKey)) { - constInst->erase(); + constOp->erase(); referencedDialects[existingOp].push_back(dialect); - return constInst = existingOp; + return constOp = existingOp; } // Otherwise, update the new dialect to the materialized operation. - referencedDialects[constInst].assign({dialect, newDialect}); - auto newIt = uniquedConstants.insert({newKey, constInst}); + referencedDialects[constOp].assign({dialect, newDialect}); + auto newIt = uniquedConstants.insert({newKey, constOp}); return newIt.first->second; } diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp index 922fbb1bee06..38aa749ae628 100644 --- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp @@ -107,7 +108,8 @@ private: // be re-added to the worklist. This function should be called when an // operation is modified or removed, as it may trigger further // simplifications. - template <typename Operands> void addToWorklist(Operands &&operands) { + template <typename Operands> + void addToWorklist(Operands &&operands) { for (Value operand : operands) { // If the use count of this operand is now < 2, we re-add the defining // operation to the worklist. @@ -140,15 +141,26 @@ private: /// if the rewrite converges in `maxIterations`. bool GreedyPatternRewriteDriver::simplify(MutableArrayRef<Region> regions, int maxIterations) { - // Add the given operation to the worklist. - auto collectOps = [this](Operation *op) { addToWorklist(op); }; + // Perform a prepass over the IR to discover constants. + for (auto &region : regions) + folder.processExistingConstants(region); bool changed = false; - int i = 0; + int iteration = 0; do { - // Add all nested operations to the worklist. + worklist.clear(); + worklistMap.clear(); + + // Add all nested operations to the worklist in preorder. for (auto &region : regions) - region.walk(collectOps); + region.walk<WalkOrder::PreOrder>( + [this](Operation *op) { worklist.push_back(op); }); + + // Reverse the list so our pop-back loop processes them in-order. + std::reverse(worklist.begin(), worklist.end()); + // Remember the reverse index. + for (unsigned i = 0, e = worklist.size(); i != e; ++i) + worklistMap[worklist[i]] = i; // These are scratch vectors used in the folding loop below.
SmallVector originalOperands, resultValues; @@ -186,6 +198,9 @@ bool GreedyPatternRewriteDriver::simplify(MutableArrayRef regions, notifyOperationRemoved(op); }; + // Add the given operation to the worklist. + auto collectOps = [this](Operation *op) { addToWorklist(op); }; + // Try to fold this op. bool inPlaceUpdate; if ((succeeded(folder.tryToFold(op, collectOps, preReplaceAction, @@ -203,7 +218,8 @@ bool GreedyPatternRewriteDriver::simplify(MutableArrayRef regions, // After applying patterns, make sure that the CFG of each of the regions is // kept up to date. changed |= succeeded(simplifyRegions(*this, regions)); - } while (changed && ++i < maxIterations); + } while (changed && ++iteration < maxIterations); + // Whether the rewrite converges, i.e. wasn't changed in the last iteration. return !changed; } diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir index 1ebacc8ef274..47896c010433 100644 --- a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir +++ b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir @@ -204,12 +204,13 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x // CHECK-DAG: %[[C0:.*]] = constant 0 : index // CHECK-DAG: %[[splat:.*]] = constant dense<7.000000e+00> : vector<15xf32> // CHECK-DAG: %[[alloc:.*]] = memref.alloca() : memref<3xvector<15xf32>> + // CHECK-DAG: [[CST:%.*]] = constant 7.000000e+00 : f32 // CHECK-DAG: %[[dim:.*]] = memref.dim %[[A]], %[[C0]] : memref // CHECK: affine.for %[[I:.*]] = 0 to 3 { // CHECK: %[[add:.*]] = affine.apply #[[$MAP0]](%[[I]])[%[[base]]] // CHECK: %[[cond1:.*]] = cmpi slt, %[[add]], %[[dim]] : index // CHECK: scf.if %[[cond1]] { - // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%[[add]], %[[base]]], %cst : memref, vector<15xf32> + // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%[[add]], %[[base]]], [[CST]] : memref, vector<15xf32> // CHECK: store %[[vec_1d]], %[[alloc]][%[[I]]] : memref<3xvector<15xf32>> // CHECK: } else { // CHECK: store %[[splat]], %[[alloc]][%[[I]]] : memref<3xvector<15xf32>> @@ -217,13 +218,14 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<3xvector<15xf32>> to memref> // CHECK: %[[cst:.*]] = memref.load %[[vmemref]][] : memref> - // FULL-UNROLL: %[[VEC0:.*]] = constant dense<7.000000e+00> : vector<3x15xf32> - // FULL-UNROLL: %[[C0:.*]] = constant 0 : index - // FULL-UNROLL: %[[SPLAT:.*]] = constant dense<7.000000e+00> : vector<15xf32> + // FULL-UNROLL-DAG: %[[VEC0:.*]] = constant dense<7.000000e+00> : vector<3x15xf32> + // FULL-UNROLL-DAG: %[[C0:.*]] = constant 0 : index + // FULL-UNROLL-DAG: %[[SPLAT:.*]] = constant dense<7.000000e+00> : vector<15xf32> + // FULL-UNROLL-DAG: [[CST:%.*]] = constant 7.000000e+00 : f32 // FULL-UNROLL: %[[DIM:.*]] = memref.dim %[[A]], %[[C0]] : memref // FULL-UNROLL: cmpi slt, %[[base]], %[[DIM]] : index // FULL-UNROLL: %[[VEC1:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], %cst : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%[[base]], %[[base]]], [[CST]] : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC0]] [0] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -233,7 +235,7 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x // FULL-UNROLL: affine.apply 
#[[$MAP1]]()[%[[base]]] // FULL-UNROLL: cmpi slt, %{{.*}}, %[[DIM]] : index // FULL-UNROLL: %[[VEC2:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %cst : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], [[CST]] : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC1]] [1] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -243,7 +245,7 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<3x // FULL-UNROLL: affine.apply #[[$MAP2]]()[%[[base]]] // FULL-UNROLL: cmpi slt, %{{.*}}, %[[DIM]] : index // FULL-UNROLL: %[[VEC3:.*]] = scf.if %{{.*}} -> (vector<3x15xf32>) { - // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], %cst : memref, vector<15xf32> + // FULL-UNROLL: vector.transfer_read %[[A]][%{{.*}}, %[[base]]], [[CST]] : memref, vector<15xf32> // FULL-UNROLL: vector.insert %{{.*}}, %[[VEC2]] [2] : vector<15xf32> into vector<3x15xf32> // FULL-UNROLL: scf.yield %{{.*}} : vector<3x15xf32> // FULL-UNROLL: } else { @@ -377,16 +379,16 @@ func @transfer_read_minor_identity(%A : memref) -> vector<3x3xf32> // CHECK-LABEL: transfer_read_minor_identity( // CHECK-SAME: %[[A:.*]]: memref) -> vector<3x3xf32> -// CHECK-DAG: %[[c0:.*]] = constant 0 : index -// CHECK-DAG: %cst = constant 0.000000e+00 : f32 // CHECK-DAG: %[[c2:.*]] = constant 2 : index // CHECK-DAG: %[[cst0:.*]] = constant dense<0.000000e+00> : vector<3xf32> // CHECK: %[[m:.*]] = memref.alloca() : memref<3xvector<3xf32>> +// CHECK-DAG: %[[cst:.*]] = constant 0.000000e+00 : f32 +// CHECK-DAG: %[[c0:.*]] = constant 0 : index // CHECK: %[[d:.*]] = memref.dim %[[A]], %[[c2]] : memref // CHECK: affine.for %[[arg1:.*]] = 0 to 3 { // CHECK: %[[cmp:.*]] = cmpi slt, %[[arg1]], %[[d]] : index // CHECK: scf.if %[[cmp]] { -// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %cst : memref, vector<3xf32> +// CHECK: %[[tr:.*]] = vector.transfer_read %[[A]][%c0, %c0, %[[arg1]], %c0], %[[cst]] : memref, vector<3xf32> // CHECK: store %[[tr]], %[[m]][%[[arg1]]] : memref<3xvector<3xf32>> // CHECK: } else { // CHECK: store %[[cst0]], %[[m]][%[[arg1]]] : memref<3xvector<3xf32>> @@ -409,8 +411,8 @@ func @transfer_write_minor_identity(%A : vector<3x3xf32>, %B : memref, // CHECK-SAME: %[[B:.*]]: memref) // CHECK-DAG: %[[c2:.*]] = constant 2 : index -// CHECK-DAG: %[[c0:.*]] = constant 0 : index // CHECK: %[[m:.*]] = memref.alloca() : memref<3xvector<3xf32>> +// CHECK-DAG: %[[c0:.*]] = constant 0 : index // CHECK: %[[cast:.*]] = vector.type_cast %[[m]] : memref<3xvector<3xf32>> to memref> // CHECK: store %[[A]], %[[cast]][] : memref> // CHECK: %[[d:.*]] = memref.dim %[[B]], %[[c2]] : memref diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir index 91547420015f..1fddf5c882c1 100644 --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -207,7 +207,7 @@ func @compose_affine_maps_diamond_dependency(%arg0: f32, %arg1: memref<4x4xf32>) // ----- -// CHECK-DAG: #[[$MAP14:.*]] = affine_map<()[s0, s1] -> (((s1 + s0) * 4) floordiv s0)> +// CHECK-DAG: #[[$MAP14:.*]] = affine_map<()[s0, s1] -> ((s0 * 4 + s1 * 4) floordiv s0)> // CHECK-LABEL: func @compose_affine_maps_multiple_symbols func @compose_affine_maps_multiple_symbols(%arg0: index, %arg1: index) -> index { @@ -312,7 +312,7 @@ func @symbolic_composition_c(%arg0: index, %arg1: index, 
%arg2: index, %arg3: in // ----- -// CHECK-DAG: #[[$MAP_symbolic_composition_d:.*]] = affine_map<()[s0, s1] -> (s0 + s1 * 3)> +// CHECK-DAG: #[[$MAP_symbolic_composition_d:.*]] = affine_map<()[s0, s1] -> (s0 * 3 + s1)> // CHECK-LABEL: func @symbolic_composition_d( // CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: index @@ -321,7 +321,7 @@ func @symbolic_composition_d(%arg0: index, %arg1: index, %arg2: index, %arg3: in %0 = affine.apply affine_map<(d0) -> (d0)>(%arg0) %1 = affine.apply affine_map<()[s0] -> (s0)>()[%arg1] %2 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s1 + s2 + s3)>()[%0, %0, %0, %1] - // CHECK: %{{.*}} = affine.apply #[[$MAP_symbolic_composition_d]]()[%[[ARG1]], %[[ARG0]]] + // CHECK: %{{.*}} = affine.apply #[[$MAP_symbolic_composition_d]]()[%[[ARG0]], %[[ARG1]]] return %2 : index } diff --git a/mlir/test/Dialect/Linalg/transform-patterns.mlir b/mlir/test/Dialect/Linalg/transform-patterns.mlir index a70816984c00..95555ceb6844 100644 --- a/mlir/test/Dialect/Linalg/transform-patterns.mlir +++ b/mlir/test/Dialect/Linalg/transform-patterns.mlir @@ -336,7 +336,7 @@ func @aligned_promote_fill(%arg0: memref) { return } // CHECK-LABEL: func @aligned_promote_fill -// CHECK: %[[cf:.*]] = constant {{.*}} : f32 +// CHECK: %[[cf:.*]] = constant 1.0{{.*}} : f32 // CHECK: %[[s0:.*]] = memref.subview {{%.*}}[{{%.*}}, {{%.*}}] [{{%.*}}, {{%.*}}] [{{%.*}}, {{%.*}}] : memref to memref // CHECK: %[[a0:.*]] = memref.alloc({{%.*}}) {alignment = 32 : i64} : memref // CHECK: %[[v0:.*]] = memref.view %[[a0]][{{.*}}][{{%.*}}, {{%.*}}] : memref to memref diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir index c6ec156e1519..a0448ea32967 100644 --- a/mlir/test/Dialect/Vector/canonicalize.mlir +++ b/mlir/test/Dialect/Vector/canonicalize.mlir @@ -234,10 +234,10 @@ func @transpose_3D_sequence(%arg : vector<4x3x2xf32>) -> vector<4x3x2xf32> { // CHECK: [[T0:%.*]] = vector.transpose [[ARG]], [2, 1, 0] %0 = vector.transpose %arg, [1, 2, 0] : vector<4x3x2xf32> to vector<3x2x4xf32> %1 = vector.transpose %0, [1, 0, 2] : vector<3x2x4xf32> to vector<2x3x4xf32> - // CHECK-NOT: transpose + // CHECK: [[T1:%.*]] = vector.transpose [[ARG]], [2, 1, 0] %2 = vector.transpose %1, [2, 1, 0] : vector<2x3x4xf32> to vector<4x3x2xf32> %3 = vector.transpose %2, [2, 1, 0] : vector<4x3x2xf32> to vector<2x3x4xf32> - // CHECK: [[MUL:%.*]] = mulf [[T0]], [[T0]] + // CHECK: [[MUL:%.*]] = mulf [[T0]], [[T1]] %4 = mulf %1, %3 : vector<2x3x4xf32> // CHECK: [[T5:%.*]] = vector.transpose [[MUL]], [2, 1, 0] %5 = vector.transpose %4, [2, 1, 0] : vector<2x3x4xf32> to vector<4x3x2xf32> diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir index a65c46452cc8..c6e535723b44 100644 --- a/mlir/test/Transforms/canonicalize.mlir +++ b/mlir/test/Transforms/canonicalize.mlir @@ -630,7 +630,7 @@ func @lowered_affine_floordiv() -> (index, index) { // // CHECK-LABEL: func @lowered_affine_ceildiv func @lowered_affine_ceildiv() -> (index, index) { -// CHECK-NEXT: %c-1 = constant -1 : index +// CHECK-DAG: %c-1 = constant -1 : index %c-43 = constant -43 : index %c42 = constant 42 : index %c0 = constant 0 : index @@ -643,7 +643,7 @@ func @lowered_affine_ceildiv() -> (index, index) { %5 = subi %c0, %4 : index %6 = addi %4, %c1 : index %7 = select %0, %5, %6 : index -// CHECK-NEXT: %c2 = constant 2 : index +// CHECK-DAG: %c2 = constant 2 : index %c43 = constant 43 : index %c42_0 = constant 42 : index %c0_1 = constant 0 : index diff --git a/mlir/test/mlir-tblgen/pattern.mlir 
b/mlir/test/mlir-tblgen/pattern.mlir index 0425cf819e60..100a7bae7689 100644 --- a/mlir/test/mlir-tblgen/pattern.mlir +++ b/mlir/test/mlir-tblgen/pattern.mlir @@ -5,8 +5,8 @@ func @verifyFusedLocs(%arg0 : i32) -> i32 { %0 = "test.op_a"(%arg0) {attr = 10 : i32} : (i32) -> i32 loc("a") %result = "test.op_a"(%0) {attr = 20 : i32} : (i32) -> i32 loc("b") - // CHECK: "test.op_b"(%arg0) {attr = 10 : i32} : (i32) -> i32 loc("a") - // CHECK: "test.op_b"(%arg0) {attr = 20 : i32} : (i32) -> i32 loc(fused["b", "a"]) + // CHECK: %0 = "test.op_b"(%arg0) {attr = 10 : i32} : (i32) -> i32 loc("a") + // CHECK: %1 = "test.op_b"(%0) {attr = 20 : i32} : (i32) -> i32 loc("b") return %result : i32 } @@ -67,7 +67,7 @@ func @verifyBenefit(%arg0 : i32) -> i32 { %2 = "test.op_g"(%1) : (i32) -> i32 // CHECK: "test.op_f"(%arg0) - // CHECK: "test.op_b"(%arg0) {attr = 34 : i32} + // CHECK: "test.op_b"(%arg0) {attr = 20 : i32} return %0 : i32 } -- GitLab From 27bc30c39d62f50fd762a8bcc2dcb0401e7263f7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 20 Mar 2021 17:43:30 -0700 Subject: [PATCH 0271/1000] [RISCV] Add test case to show a case where (mul (and X, 0xffffffff), (and Y, 0xffffffff)) optimization does not improve code. If the mul had two users, one of which was a sext.w, the mul would also be selected to a MULW before our pattern runs. This causes the ANDs to now be used by the already selected MULW and the mul we still need to select. They are unneeded on the MULW since MULW only reads the lower bits. So they get selected to SLLI+SRLI for the MULW use. The use for the (mul (and X, 0xffffffff), (and Y, 0xffffffff)) manages to reuse the SLLI. The end result is increased register pressure and no improvement to how soon we can start the MULW. --- llvm/test/CodeGen/RISCV/xaluo.ll | 567 +++++++++++++++++-------------- 1 file changed, 309 insertions(+), 258 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/xaluo.ll b/llvm/test/CodeGen/RISCV/xaluo.ll index f34093e8d6f3..b535fd93be76 100644 --- a/llvm/test/CodeGen/RISCV/xaluo.ll +++ b/llvm/test/CodeGen/RISCV/xaluo.ll @@ -1031,6 +1031,57 @@ entry: ret i1 %obit } +; Similar to umulo.i32, but storing the overflow and returning the result.
+define signext i32 @umulo3.i32(i32 signext %0, i32 signext %1, i32* %2) { +; RV32-LABEL: umulo3.i32: +; RV32: # %bb.0: +; RV32-NEXT: mul a3, a0, a1 +; RV32-NEXT: mulhu a0, a0, a1 +; RV32-NEXT: snez a0, a0 +; RV32-NEXT: sw a0, 0(a2) +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: ret +; +; RV64-LABEL: umulo3.i32: +; RV64: # %bb.0: +; RV64-NEXT: slli a1, a1, 32 +; RV64-NEXT: srli a3, a1, 32 +; RV64-NEXT: slli a0, a0, 32 +; RV64-NEXT: srli a4, a0, 32 +; RV64-NEXT: mulhu a0, a0, a1 +; RV64-NEXT: srli a0, a0, 32 +; RV64-NEXT: snez a1, a0 +; RV64-NEXT: mulw a0, a4, a3 +; RV64-NEXT: sw a1, 0(a2) +; RV64-NEXT: ret +; +; RV32ZBA-LABEL: umulo3.i32: +; RV32ZBA: # %bb.0: +; RV32ZBA-NEXT: mul a3, a0, a1 +; RV32ZBA-NEXT: mulhu a0, a0, a1 +; RV32ZBA-NEXT: snez a0, a0 +; RV32ZBA-NEXT: sw a0, 0(a2) +; RV32ZBA-NEXT: mv a0, a3 +; RV32ZBA-NEXT: ret +; +; RV64ZBA-LABEL: umulo3.i32: +; RV64ZBA: # %bb.0: +; RV64ZBA-NEXT: zext.w a1, a1 +; RV64ZBA-NEXT: zext.w a0, a0 +; RV64ZBA-NEXT: mul a3, a0, a1 +; RV64ZBA-NEXT: srli a3, a3, 32 +; RV64ZBA-NEXT: snez a3, a3 +; RV64ZBA-NEXT: mulw a0, a0, a1 +; RV64ZBA-NEXT: sw a3, 0(a2) +; RV64ZBA-NEXT: ret + %4 = tail call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %0, i32 %1) + %5 = extractvalue { i32, i1 } %4, 1 + %6 = extractvalue { i32, i1 } %4, 0 + %7 = zext i1 %5 to i32 + store i32 %7, i32* %2, align 4 + ret i32 %6 +} + define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) { ; RV32-LABEL: umulo.i64: ; RV32: # %bb.0: # %entry @@ -1175,10 +1226,10 @@ define i32 @saddo.select.i32(i32 %v1, i32 %v2) { ; RV32-NEXT: add a2, a0, a1 ; RV32-NEXT: slt a2, a2, a0 ; RV32-NEXT: slti a3, a1, 0 -; RV32-NEXT: bne a3, a2, .LBB22_2 +; RV32-NEXT: bne a3, a2, .LBB23_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: .LBB22_2: # %entry +; RV32-NEXT: .LBB23_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: saddo.select.i32: @@ -1187,10 +1238,10 @@ define i32 @saddo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: sext.w a3, a0 ; RV64-NEXT: add a4, a3, a2 ; RV64-NEXT: addw a2, a3, a2 -; RV64-NEXT: bne a2, a4, .LBB22_2 +; RV64-NEXT: bne a2, a4, .LBB23_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB22_2: # %entry +; RV64-NEXT: .LBB23_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: saddo.select.i32: @@ -1198,10 +1249,10 @@ define i32 @saddo.select.i32(i32 %v1, i32 %v2) { ; RV32ZBA-NEXT: add a2, a0, a1 ; RV32ZBA-NEXT: slt a2, a2, a0 ; RV32ZBA-NEXT: slti a3, a1, 0 -; RV32ZBA-NEXT: bne a3, a2, .LBB22_2 +; RV32ZBA-NEXT: bne a3, a2, .LBB23_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a1 -; RV32ZBA-NEXT: .LBB22_2: # %entry +; RV32ZBA-NEXT: .LBB23_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: saddo.select.i32: @@ -1210,10 +1261,10 @@ define i32 @saddo.select.i32(i32 %v1, i32 %v2) { ; RV64ZBA-NEXT: sext.w a3, a0 ; RV64ZBA-NEXT: add a4, a3, a2 ; RV64ZBA-NEXT: addw a2, a3, a2 -; RV64ZBA-NEXT: bne a2, a4, .LBB22_2 +; RV64ZBA-NEXT: bne a2, a4, .LBB23_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB22_2: # %entry +; RV64ZBA-NEXT: .LBB23_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) @@ -1278,11 +1329,11 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a5, a1, a3 ; RV32-NEXT: not a5, a5 ; RV32-NEXT: and a4, a5, a4 -; RV32-NEXT: bltz a4, .LBB24_2 +; RV32-NEXT: bltz a4, .LBB25_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: .LBB24_2: # %entry +; RV32-NEXT: .LBB25_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: 
saddo.select.i64: @@ -1290,10 +1341,10 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: add a2, a0, a1 ; RV64-NEXT: slt a2, a2, a0 ; RV64-NEXT: slti a3, a1, 0 -; RV64-NEXT: bne a3, a2, .LBB24_2 +; RV64-NEXT: bne a3, a2, .LBB25_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB24_2: # %entry +; RV64-NEXT: .LBB25_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: saddo.select.i64: @@ -1306,11 +1357,11 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a5, a1, a3 ; RV32ZBA-NEXT: not a5, a5 ; RV32ZBA-NEXT: and a4, a5, a4 -; RV32ZBA-NEXT: bltz a4, .LBB24_2 +; RV32ZBA-NEXT: bltz a4, .LBB25_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 -; RV32ZBA-NEXT: .LBB24_2: # %entry +; RV32ZBA-NEXT: .LBB25_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: saddo.select.i64: @@ -1318,10 +1369,10 @@ define i64 @saddo.select.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: add a2, a0, a1 ; RV64ZBA-NEXT: slt a2, a2, a0 ; RV64ZBA-NEXT: slti a3, a1, 0 -; RV64ZBA-NEXT: bne a3, a2, .LBB24_2 +; RV64ZBA-NEXT: bne a3, a2, .LBB25_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB24_2: # %entry +; RV64ZBA-NEXT: .LBB25_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) @@ -1387,39 +1438,39 @@ define i32 @uaddo.select.i32(i32 %v1, i32 %v2) { ; RV32-LABEL: uaddo.select.i32: ; RV32: # %bb.0: # %entry ; RV32-NEXT: add a2, a0, a1 -; RV32-NEXT: bltu a2, a0, .LBB26_2 +; RV32-NEXT: bltu a2, a0, .LBB27_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: .LBB26_2: # %entry +; RV32-NEXT: .LBB27_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: uaddo.select.i32: ; RV64: # %bb.0: # %entry ; RV64-NEXT: addw a2, a0, a1 ; RV64-NEXT: sext.w a3, a0 -; RV64-NEXT: bltu a2, a3, .LBB26_2 +; RV64-NEXT: bltu a2, a3, .LBB27_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB26_2: # %entry +; RV64-NEXT: .LBB27_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: uaddo.select.i32: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: add a2, a0, a1 -; RV32ZBA-NEXT: bltu a2, a0, .LBB26_2 +; RV32ZBA-NEXT: bltu a2, a0, .LBB27_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a1 -; RV32ZBA-NEXT: .LBB26_2: # %entry +; RV32ZBA-NEXT: .LBB27_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: uaddo.select.i32: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: addw a2, a0, a1 ; RV64ZBA-NEXT: sext.w a3, a0 -; RV64ZBA-NEXT: bltu a2, a3, .LBB26_2 +; RV64ZBA-NEXT: bltu a2, a3, .LBB27_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB26_2: # %entry +; RV64ZBA-NEXT: .LBB27_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) @@ -1472,15 +1523,15 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: add a4, a0, a2 ; RV32-NEXT: sltu a4, a4, a0 ; RV32-NEXT: add a5, a5, a4 -; RV32-NEXT: bne a5, a1, .LBB28_3 +; RV32-NEXT: bne a5, a1, .LBB29_3 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: beqz a4, .LBB28_4 -; RV32-NEXT: .LBB28_2: # %entry +; RV32-NEXT: beqz a4, .LBB29_4 +; RV32-NEXT: .LBB29_2: # %entry ; RV32-NEXT: ret -; RV32-NEXT: .LBB28_3: # %entry +; RV32-NEXT: .LBB29_3: # %entry ; RV32-NEXT: sltu a4, a5, a1 -; RV32-NEXT: bnez a4, .LBB28_2 -; RV32-NEXT: .LBB28_4: # %entry +; RV32-NEXT: bnez a4, .LBB29_2 +; RV32-NEXT: .LBB29_4: # %entry ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 ; RV32-NEXT: ret @@ -1488,10 +1539,10 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { ; 
RV64-LABEL: uaddo.select.i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: add a2, a0, a1 -; RV64-NEXT: bltu a2, a0, .LBB28_2 +; RV64-NEXT: bltu a2, a0, .LBB29_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB28_2: # %entry +; RV64-NEXT: .LBB29_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: uaddo.select.i64: @@ -1500,15 +1551,15 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: add a4, a0, a2 ; RV32ZBA-NEXT: sltu a4, a4, a0 ; RV32ZBA-NEXT: add a5, a5, a4 -; RV32ZBA-NEXT: bne a5, a1, .LBB28_3 +; RV32ZBA-NEXT: bne a5, a1, .LBB29_3 ; RV32ZBA-NEXT: # %bb.1: # %entry -; RV32ZBA-NEXT: beqz a4, .LBB28_4 -; RV32ZBA-NEXT: .LBB28_2: # %entry +; RV32ZBA-NEXT: beqz a4, .LBB29_4 +; RV32ZBA-NEXT: .LBB29_2: # %entry ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB28_3: # %entry +; RV32ZBA-NEXT: .LBB29_3: # %entry ; RV32ZBA-NEXT: sltu a4, a5, a1 -; RV32ZBA-NEXT: bnez a4, .LBB28_2 -; RV32ZBA-NEXT: .LBB28_4: # %entry +; RV32ZBA-NEXT: bnez a4, .LBB29_2 +; RV32ZBA-NEXT: .LBB29_4: # %entry ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 ; RV32ZBA-NEXT: ret @@ -1516,10 +1567,10 @@ define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { ; RV64ZBA-LABEL: uaddo.select.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: add a2, a0, a1 -; RV64ZBA-NEXT: bltu a2, a0, .LBB28_2 +; RV64ZBA-NEXT: bltu a2, a0, .LBB29_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB28_2: # %entry +; RV64ZBA-NEXT: .LBB29_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) @@ -1535,10 +1586,10 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: add a2, a3, a0 -; RV32-NEXT: beq a2, a1, .LBB29_2 +; RV32-NEXT: beq a2, a1, .LBB30_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a2, a1 -; RV32-NEXT: .LBB29_2: # %entry +; RV32-NEXT: .LBB30_2: # %entry ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret ; @@ -1555,10 +1606,10 @@ define i1 @uaddo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: add a2, a0, a2 ; RV32ZBA-NEXT: sltu a0, a2, a0 ; RV32ZBA-NEXT: add a2, a3, a0 -; RV32ZBA-NEXT: beq a2, a1, .LBB29_2 +; RV32ZBA-NEXT: beq a2, a1, .LBB30_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a2, a1 -; RV32ZBA-NEXT: .LBB29_2: # %entry +; RV32ZBA-NEXT: .LBB30_2: # %entry ; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret ; @@ -1581,10 +1632,10 @@ define i32 @ssubo.select.i32(i32 %v1, i32 %v2) { ; RV32-NEXT: sgtz a2, a1 ; RV32-NEXT: sub a3, a0, a1 ; RV32-NEXT: slt a3, a3, a0 -; RV32-NEXT: bne a2, a3, .LBB30_2 +; RV32-NEXT: bne a2, a3, .LBB31_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: .LBB30_2: # %entry +; RV32-NEXT: .LBB31_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: ssubo.select.i32: @@ -1593,10 +1644,10 @@ define i32 @ssubo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: sext.w a3, a0 ; RV64-NEXT: sub a4, a3, a2 ; RV64-NEXT: subw a2, a3, a2 -; RV64-NEXT: bne a2, a4, .LBB30_2 +; RV64-NEXT: bne a2, a4, .LBB31_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB30_2: # %entry +; RV64-NEXT: .LBB31_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: ssubo.select.i32: @@ -1604,10 +1655,10 @@ define i32 @ssubo.select.i32(i32 %v1, i32 %v2) { ; RV32ZBA-NEXT: sgtz a2, a1 ; RV32ZBA-NEXT: sub a3, a0, a1 ; RV32ZBA-NEXT: slt a3, a3, a0 -; RV32ZBA-NEXT: bne a2, a3, .LBB30_2 +; RV32ZBA-NEXT: bne a2, a3, .LBB31_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a1 -; RV32ZBA-NEXT: .LBB30_2: # %entry +; RV32ZBA-NEXT: .LBB31_2: # %entry ; 
RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: ssubo.select.i32: @@ -1616,10 +1667,10 @@ define i32 @ssubo.select.i32(i32 %v1, i32 %v2) { ; RV64ZBA-NEXT: sext.w a3, a0 ; RV64ZBA-NEXT: sub a4, a3, a2 ; RV64ZBA-NEXT: subw a2, a3, a2 -; RV64ZBA-NEXT: bne a2, a4, .LBB30_2 +; RV64ZBA-NEXT: bne a2, a4, .LBB31_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB30_2: # %entry +; RV64ZBA-NEXT: .LBB31_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) @@ -1682,11 +1733,11 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a4, a1, a4 ; RV32-NEXT: xor a5, a1, a3 ; RV32-NEXT: and a4, a5, a4 -; RV32-NEXT: bltz a4, .LBB32_2 +; RV32-NEXT: bltz a4, .LBB33_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: .LBB32_2: # %entry +; RV32-NEXT: .LBB33_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: ssubo.select.i64: @@ -1694,10 +1745,10 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: sgtz a2, a1 ; RV64-NEXT: sub a3, a0, a1 ; RV64-NEXT: slt a3, a3, a0 -; RV64-NEXT: bne a2, a3, .LBB32_2 +; RV64-NEXT: bne a2, a3, .LBB33_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB32_2: # %entry +; RV64-NEXT: .LBB33_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: ssubo.select.i64: @@ -1708,11 +1759,11 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a4, a1, a4 ; RV32ZBA-NEXT: xor a5, a1, a3 ; RV32ZBA-NEXT: and a4, a5, a4 -; RV32ZBA-NEXT: bltz a4, .LBB32_2 +; RV32ZBA-NEXT: bltz a4, .LBB33_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 -; RV32ZBA-NEXT: .LBB32_2: # %entry +; RV32ZBA-NEXT: .LBB33_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: ssubo.select.i64: @@ -1720,10 +1771,10 @@ define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: sgtz a2, a1 ; RV64ZBA-NEXT: sub a3, a0, a1 ; RV64ZBA-NEXT: slt a3, a3, a0 -; RV64ZBA-NEXT: bne a2, a3, .LBB32_2 +; RV64ZBA-NEXT: bne a2, a3, .LBB33_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB32_2: # %entry +; RV64ZBA-NEXT: .LBB33_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) @@ -1785,39 +1836,39 @@ define i32 @usubo.select.i32(i32 %v1, i32 %v2) { ; RV32-LABEL: usubo.select.i32: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sub a2, a0, a1 -; RV32-NEXT: bltu a0, a2, .LBB34_2 +; RV32-NEXT: bltu a0, a2, .LBB35_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: .LBB34_2: # %entry +; RV32-NEXT: .LBB35_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: usubo.select.i32: ; RV64: # %bb.0: # %entry ; RV64-NEXT: subw a2, a0, a1 ; RV64-NEXT: sext.w a3, a0 -; RV64-NEXT: bltu a3, a2, .LBB34_2 +; RV64-NEXT: bltu a3, a2, .LBB35_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB34_2: # %entry +; RV64-NEXT: .LBB35_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: usubo.select.i32: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sub a2, a0, a1 -; RV32ZBA-NEXT: bltu a0, a2, .LBB34_2 +; RV32ZBA-NEXT: bltu a0, a2, .LBB35_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a1 -; RV32ZBA-NEXT: .LBB34_2: # %entry +; RV32ZBA-NEXT: .LBB35_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: usubo.select.i32: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: subw a2, a0, a1 ; RV64ZBA-NEXT: sext.w a3, a0 -; RV64ZBA-NEXT: bltu a3, a2, .LBB34_2 +; RV64ZBA-NEXT: bltu a3, a2, .LBB35_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: 
.LBB34_2: # %entry +; RV64ZBA-NEXT: .LBB35_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) @@ -1869,28 +1920,28 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: sltu a4, a0, a2 ; RV32-NEXT: sub a5, a1, a3 ; RV32-NEXT: sub a4, a5, a4 -; RV32-NEXT: beq a4, a1, .LBB36_2 +; RV32-NEXT: beq a4, a1, .LBB37_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a4, a1, a4 -; RV32-NEXT: beqz a4, .LBB36_3 -; RV32-NEXT: j .LBB36_4 -; RV32-NEXT: .LBB36_2: +; RV32-NEXT: beqz a4, .LBB37_3 +; RV32-NEXT: j .LBB37_4 +; RV32-NEXT: .LBB37_2: ; RV32-NEXT: sub a4, a0, a2 ; RV32-NEXT: sltu a4, a0, a4 -; RV32-NEXT: bnez a4, .LBB36_4 -; RV32-NEXT: .LBB36_3: # %entry +; RV32-NEXT: bnez a4, .LBB37_4 +; RV32-NEXT: .LBB37_3: # %entry ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: .LBB36_4: # %entry +; RV32-NEXT: .LBB37_4: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: usubo.select.i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: sub a2, a0, a1 -; RV64-NEXT: bltu a0, a2, .LBB36_2 +; RV64-NEXT: bltu a0, a2, .LBB37_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB36_2: # %entry +; RV64-NEXT: .LBB37_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: usubo.select.i64: @@ -1898,28 +1949,28 @@ define i64 @usubo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: sltu a4, a0, a2 ; RV32ZBA-NEXT: sub a5, a1, a3 ; RV32ZBA-NEXT: sub a4, a5, a4 -; RV32ZBA-NEXT: beq a4, a1, .LBB36_2 +; RV32ZBA-NEXT: beq a4, a1, .LBB37_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a4, a1, a4 -; RV32ZBA-NEXT: beqz a4, .LBB36_3 -; RV32ZBA-NEXT: j .LBB36_4 -; RV32ZBA-NEXT: .LBB36_2: +; RV32ZBA-NEXT: beqz a4, .LBB37_3 +; RV32ZBA-NEXT: j .LBB37_4 +; RV32ZBA-NEXT: .LBB37_2: ; RV32ZBA-NEXT: sub a4, a0, a2 ; RV32ZBA-NEXT: sltu a4, a0, a4 -; RV32ZBA-NEXT: bnez a4, .LBB36_4 -; RV32ZBA-NEXT: .LBB36_3: # %entry +; RV32ZBA-NEXT: bnez a4, .LBB37_4 +; RV32ZBA-NEXT: .LBB37_3: # %entry ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 -; RV32ZBA-NEXT: .LBB36_4: # %entry +; RV32ZBA-NEXT: .LBB37_4: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: usubo.select.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: sub a2, a0, a1 -; RV64ZBA-NEXT: bltu a0, a2, .LBB36_2 +; RV64ZBA-NEXT: bltu a0, a2, .LBB37_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB36_2: # %entry +; RV64ZBA-NEXT: .LBB37_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) @@ -1934,12 +1985,12 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: sltu a4, a0, a2 ; RV32-NEXT: sub a3, a1, a3 ; RV32-NEXT: sub a3, a3, a4 -; RV32-NEXT: beq a3, a1, .LBB37_2 +; RV32-NEXT: beq a3, a1, .LBB38_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a1, a3 ; RV32-NEXT: xori a0, a0, 1 ; RV32-NEXT: ret -; RV32-NEXT: .LBB37_2: +; RV32-NEXT: .LBB38_2: ; RV32-NEXT: sub a1, a0, a2 ; RV32-NEXT: sltu a0, a0, a1 ; RV32-NEXT: xori a0, a0, 1 @@ -1957,12 +2008,12 @@ define i1 @usubo.not.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: sltu a4, a0, a2 ; RV32ZBA-NEXT: sub a3, a1, a3 ; RV32ZBA-NEXT: sub a3, a3, a4 -; RV32ZBA-NEXT: beq a3, a1, .LBB37_2 +; RV32ZBA-NEXT: beq a3, a1, .LBB38_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a1, a3 ; RV32ZBA-NEXT: xori a0, a0, 1 ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB37_2: +; RV32ZBA-NEXT: .LBB38_2: ; RV32ZBA-NEXT: sub a1, a0, a2 ; RV32ZBA-NEXT: sltu a0, a0, a1 ; RV32ZBA-NEXT: xori a0, a0, 1 @@ -1987,10 +2038,10 @@ define i32 @smulo.select.i32(i32 %v1, i32 %v2) { ; RV32-NEXT: mulh a2, a0, 
a1 ; RV32-NEXT: mul a3, a0, a1 ; RV32-NEXT: srai a3, a3, 31 -; RV32-NEXT: bne a2, a3, .LBB38_2 +; RV32-NEXT: bne a2, a3, .LBB39_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: .LBB38_2: # %entry +; RV32-NEXT: .LBB39_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: smulo.select.i32: @@ -1999,10 +2050,10 @@ define i32 @smulo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: sext.w a3, a0 ; RV64-NEXT: mul a4, a3, a2 ; RV64-NEXT: mulw a2, a3, a2 -; RV64-NEXT: bne a2, a4, .LBB38_2 +; RV64-NEXT: bne a2, a4, .LBB39_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB38_2: # %entry +; RV64-NEXT: .LBB39_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.select.i32: @@ -2010,10 +2061,10 @@ define i32 @smulo.select.i32(i32 %v1, i32 %v2) { ; RV32ZBA-NEXT: mulh a2, a0, a1 ; RV32ZBA-NEXT: mul a3, a0, a1 ; RV32ZBA-NEXT: srai a3, a3, 31 -; RV32ZBA-NEXT: bne a2, a3, .LBB38_2 +; RV32ZBA-NEXT: bne a2, a3, .LBB39_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a1 -; RV32ZBA-NEXT: .LBB38_2: # %entry +; RV32ZBA-NEXT: .LBB39_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: smulo.select.i32: @@ -2022,10 +2073,10 @@ define i32 @smulo.select.i32(i32 %v1, i32 %v2) { ; RV64ZBA-NEXT: sext.w a3, a0 ; RV64ZBA-NEXT: mul a4, a3, a2 ; RV64ZBA-NEXT: mulw a2, a3, a2 -; RV64ZBA-NEXT: bne a2, a4, .LBB38_2 +; RV64ZBA-NEXT: bne a2, a4, .LBB39_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB38_2: # %entry +; RV64ZBA-NEXT: .LBB39_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) @@ -2102,11 +2153,11 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: addi a4, sp, 8 ; RV32-NEXT: call __mulodi4@plt ; RV32-NEXT: lw a0, 8(sp) -; RV32-NEXT: bnez a0, .LBB40_2 +; RV32-NEXT: bnez a0, .LBB41_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv s1, s3 ; RV32-NEXT: mv s0, s2 -; RV32-NEXT: .LBB40_2: # %entry +; RV32-NEXT: .LBB41_2: # %entry ; RV32-NEXT: mv a0, s1 ; RV32-NEXT: mv a1, s0 ; RV32-NEXT: lw s3, 12(sp) # 4-byte Folded Reload @@ -2122,10 +2173,10 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mulh a2, a0, a1 ; RV64-NEXT: mul a3, a0, a1 ; RV64-NEXT: srai a3, a3, 63 -; RV64-NEXT: bne a2, a3, .LBB40_2 +; RV64-NEXT: bne a2, a3, .LBB41_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB40_2: # %entry +; RV64-NEXT: .LBB41_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: smulo.select.i64: @@ -2150,11 +2201,11 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: addi a4, sp, 8 ; RV32ZBA-NEXT: call __mulodi4@plt ; RV32ZBA-NEXT: lw a0, 8(sp) -; RV32ZBA-NEXT: bnez a0, .LBB40_2 +; RV32ZBA-NEXT: bnez a0, .LBB41_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv s1, s3 ; RV32ZBA-NEXT: mv s0, s2 -; RV32ZBA-NEXT: .LBB40_2: # %entry +; RV32ZBA-NEXT: .LBB41_2: # %entry ; RV32ZBA-NEXT: mv a0, s1 ; RV32ZBA-NEXT: mv a1, s0 ; RV32ZBA-NEXT: lw s3, 12(sp) # 4-byte Folded Reload @@ -2170,10 +2221,10 @@ define i64 @smulo.select.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: mulh a2, a0, a1 ; RV64ZBA-NEXT: mul a3, a0, a1 ; RV64ZBA-NEXT: srai a3, a3, 63 -; RV64ZBA-NEXT: bne a2, a3, .LBB40_2 +; RV64ZBA-NEXT: bne a2, a3, .LBB41_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB40_2: # %entry +; RV64ZBA-NEXT: .LBB41_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) @@ -2241,10 +2292,10 @@ define i32 @umulo.select.i32(i32 %v1, i32 %v2) { ; RV32-LABEL: umulo.select.i32: ; RV32: # 
%bb.0: # %entry ; RV32-NEXT: mulhu a2, a0, a1 -; RV32-NEXT: bnez a2, .LBB42_2 +; RV32-NEXT: bnez a2, .LBB43_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a1 -; RV32-NEXT: .LBB42_2: # %entry +; RV32-NEXT: .LBB43_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: umulo.select.i32: @@ -2253,19 +2304,19 @@ define i32 @umulo.select.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: slli a3, a0, 32 ; RV64-NEXT: mulhu a2, a3, a2 ; RV64-NEXT: srli a2, a2, 32 -; RV64-NEXT: bnez a2, .LBB42_2 +; RV64-NEXT: bnez a2, .LBB43_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB42_2: # %entry +; RV64-NEXT: .LBB43_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.select.i32: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mulhu a2, a0, a1 -; RV32ZBA-NEXT: bnez a2, .LBB42_2 +; RV32ZBA-NEXT: bnez a2, .LBB43_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a1 -; RV32ZBA-NEXT: .LBB42_2: # %entry +; RV32ZBA-NEXT: .LBB43_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.select.i32: @@ -2274,10 +2325,10 @@ define i32 @umulo.select.i32(i32 %v1, i32 %v2) { ; RV64ZBA-NEXT: zext.w a3, a0 ; RV64ZBA-NEXT: mul a2, a3, a2 ; RV64ZBA-NEXT: srli a2, a2, 32 -; RV64ZBA-NEXT: bnez a2, .LBB42_2 +; RV64ZBA-NEXT: bnez a2, .LBB43_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB42_2: # %entry +; RV64ZBA-NEXT: .LBB43_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) @@ -2342,20 +2393,20 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: snez a5, a5 ; RV32-NEXT: or a4, a4, a5 ; RV32-NEXT: or a4, a4, a6 -; RV32-NEXT: bnez a4, .LBB44_2 +; RV32-NEXT: bnez a4, .LBB45_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: mv a0, a2 ; RV32-NEXT: mv a1, a3 -; RV32-NEXT: .LBB44_2: # %entry +; RV32-NEXT: .LBB45_2: # %entry ; RV32-NEXT: ret ; ; RV64-LABEL: umulo.select.i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: mulhu a2, a0, a1 -; RV64-NEXT: bnez a2, .LBB44_2 +; RV64-NEXT: bnez a2, .LBB45_2 ; RV64-NEXT: # %bb.1: # %entry ; RV64-NEXT: mv a0, a1 -; RV64-NEXT: .LBB44_2: # %entry +; RV64-NEXT: .LBB45_2: # %entry ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.select.i64: @@ -2376,20 +2427,20 @@ define i64 @umulo.select.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: snez a5, a5 ; RV32ZBA-NEXT: or a4, a4, a5 ; RV32ZBA-NEXT: or a4, a4, a6 -; RV32ZBA-NEXT: bnez a4, .LBB44_2 +; RV32ZBA-NEXT: bnez a4, .LBB45_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: mv a0, a2 ; RV32ZBA-NEXT: mv a1, a3 -; RV32ZBA-NEXT: .LBB44_2: # %entry +; RV32ZBA-NEXT: .LBB45_2: # %entry ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.select.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: mulhu a2, a0, a1 -; RV64ZBA-NEXT: bnez a2, .LBB44_2 +; RV64ZBA-NEXT: bnez a2, .LBB45_2 ; RV64ZBA-NEXT: # %bb.1: # %entry ; RV64ZBA-NEXT: mv a0, a1 -; RV64ZBA-NEXT: .LBB44_2: # %entry +; RV64ZBA-NEXT: .LBB45_2: # %entry ; RV64ZBA-NEXT: ret entry: %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) @@ -2469,11 +2520,11 @@ define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) { ; RV32-NEXT: add a2, a0, a1 ; RV32-NEXT: slt a0, a2, a0 ; RV32-NEXT: slti a1, a1, 0 -; RV32-NEXT: beq a1, a0, .LBB46_2 +; RV32-NEXT: beq a1, a0, .LBB47_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB46_2: # %continue +; RV32-NEXT: .LBB47_2: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; @@ -2483,11 +2534,11 @@ define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: add a2, a0, a1 ; RV64-NEXT: addw a0, a0, 
a1 -; RV64-NEXT: beq a0, a2, .LBB46_2 +; RV64-NEXT: beq a0, a2, .LBB47_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB46_2: # %continue +; RV64-NEXT: .LBB47_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -2496,11 +2547,11 @@ define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) { ; RV32ZBA-NEXT: add a2, a0, a1 ; RV32ZBA-NEXT: slt a0, a2, a0 ; RV32ZBA-NEXT: slti a1, a1, 0 -; RV32ZBA-NEXT: beq a1, a0, .LBB46_2 +; RV32ZBA-NEXT: beq a1, a0, .LBB47_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB46_2: # %continue +; RV32ZBA-NEXT: .LBB47_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; @@ -2510,11 +2561,11 @@ define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) { ; RV64ZBA-NEXT: sext.w a0, a0 ; RV64ZBA-NEXT: add a2, a0, a1 ; RV64ZBA-NEXT: addw a0, a0, a1 -; RV64ZBA-NEXT: beq a0, a2, .LBB46_2 +; RV64ZBA-NEXT: beq a0, a2, .LBB47_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB46_2: # %continue +; RV64ZBA-NEXT: .LBB47_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -2541,11 +2592,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a1, a1, a3 ; RV32-NEXT: not a1, a1 ; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: bgez a0, .LBB47_2 +; RV32-NEXT: bgez a0, .LBB48_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB47_2: # %continue +; RV32-NEXT: .LBB48_2: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; @@ -2554,11 +2605,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: add a2, a0, a1 ; RV64-NEXT: slt a0, a2, a0 ; RV64-NEXT: slti a1, a1, 0 -; RV64-NEXT: beq a1, a0, .LBB47_2 +; RV64-NEXT: beq a1, a0, .LBB48_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB47_2: # %continue +; RV64-NEXT: .LBB48_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -2572,11 +2623,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a1, a1, a3 ; RV32ZBA-NEXT: not a1, a1 ; RV32ZBA-NEXT: and a0, a1, a0 -; RV32ZBA-NEXT: bgez a0, .LBB47_2 +; RV32ZBA-NEXT: bgez a0, .LBB48_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB47_2: # %continue +; RV32ZBA-NEXT: .LBB48_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; @@ -2585,11 +2636,11 @@ define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: add a2, a0, a1 ; RV64ZBA-NEXT: slt a0, a2, a0 ; RV64ZBA-NEXT: slti a1, a1, 0 -; RV64ZBA-NEXT: beq a1, a0, .LBB47_2 +; RV64ZBA-NEXT: beq a1, a0, .LBB48_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB47_2: # %continue +; RV64ZBA-NEXT: .LBB48_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -2609,11 +2660,11 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) { ; RV32-LABEL: uaddo.br.i32: ; RV32: # %bb.0: # %entry ; RV32-NEXT: add a1, a0, a1 -; RV32-NEXT: bgeu a1, a0, .LBB48_2 +; RV32-NEXT: bgeu a1, a0, .LBB49_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB48_2: # %continue +; RV32-NEXT: .LBB49_2: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; @@ -2621,22 +2672,22 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) { ; RV64: # %bb.0: # %entry ; RV64-NEXT: addw a1, a0, a1 ; RV64-NEXT: sext.w a0, a0 -; 
RV64-NEXT: bgeu a1, a0, .LBB48_2 +; RV64-NEXT: bgeu a1, a0, .LBB49_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB48_2: # %continue +; RV64-NEXT: .LBB49_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: uaddo.br.i32: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: add a1, a0, a1 -; RV32ZBA-NEXT: bgeu a1, a0, .LBB48_2 +; RV32ZBA-NEXT: bgeu a1, a0, .LBB49_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB48_2: # %continue +; RV32ZBA-NEXT: .LBB49_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; @@ -2644,11 +2695,11 @@ define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) { ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: addw a1, a0, a1 ; RV64ZBA-NEXT: sext.w a0, a0 -; RV64ZBA-NEXT: bgeu a1, a0, .LBB48_2 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB49_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB48_2: # %continue +; RV64ZBA-NEXT: .LBB49_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -2671,26 +2722,26 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: add a2, a0, a2 ; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: add a2, a3, a0 -; RV32-NEXT: beq a2, a1, .LBB49_2 +; RV32-NEXT: beq a2, a1, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a2, a1 -; RV32-NEXT: .LBB49_2: # %entry -; RV32-NEXT: beqz a0, .LBB49_4 +; RV32-NEXT: .LBB50_2: # %entry +; RV32-NEXT: beqz a0, .LBB50_4 ; RV32-NEXT: # %bb.3: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB49_4: # %continue +; RV32-NEXT: .LBB50_4: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; ; RV64-LABEL: uaddo.br.i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: add a1, a0, a1 -; RV64-NEXT: bgeu a1, a0, .LBB49_2 +; RV64-NEXT: bgeu a1, a0, .LBB50_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB49_2: # %continue +; RV64-NEXT: .LBB50_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -2700,26 +2751,26 @@ define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: add a2, a0, a2 ; RV32ZBA-NEXT: sltu a0, a2, a0 ; RV32ZBA-NEXT: add a2, a3, a0 -; RV32ZBA-NEXT: beq a2, a1, .LBB49_2 +; RV32ZBA-NEXT: beq a2, a1, .LBB50_2 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a2, a1 -; RV32ZBA-NEXT: .LBB49_2: # %entry -; RV32ZBA-NEXT: beqz a0, .LBB49_4 +; RV32ZBA-NEXT: .LBB50_2: # %entry +; RV32ZBA-NEXT: beqz a0, .LBB50_4 ; RV32ZBA-NEXT: # %bb.3: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB49_4: # %continue +; RV32ZBA-NEXT: .LBB50_4: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: uaddo.br.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: add a1, a0, a1 -; RV64ZBA-NEXT: bgeu a1, a0, .LBB49_2 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB50_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB49_2: # %continue +; RV64ZBA-NEXT: .LBB50_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -2741,11 +2792,11 @@ define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) { ; RV32-NEXT: sgtz a2, a1 ; RV32-NEXT: sub a1, a0, a1 ; RV32-NEXT: slt a0, a1, a0 -; RV32-NEXT: beq a2, a0, .LBB50_2 +; RV32-NEXT: beq a2, a0, .LBB51_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB50_2: # %continue +; RV32-NEXT: .LBB51_2: # %continue ; RV32-NEXT: addi a0, 
zero, 1 ; RV32-NEXT: ret ; @@ -2755,11 +2806,11 @@ define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: sub a2, a0, a1 ; RV64-NEXT: subw a0, a0, a1 -; RV64-NEXT: beq a0, a2, .LBB50_2 +; RV64-NEXT: beq a0, a2, .LBB51_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB50_2: # %continue +; RV64-NEXT: .LBB51_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -2768,11 +2819,11 @@ define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) { ; RV32ZBA-NEXT: sgtz a2, a1 ; RV32ZBA-NEXT: sub a1, a0, a1 ; RV32ZBA-NEXT: slt a0, a1, a0 -; RV32ZBA-NEXT: beq a2, a0, .LBB50_2 +; RV32ZBA-NEXT: beq a2, a0, .LBB51_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB50_2: # %continue +; RV32ZBA-NEXT: .LBB51_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; @@ -2782,11 +2833,11 @@ define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) { ; RV64ZBA-NEXT: sext.w a0, a0 ; RV64ZBA-NEXT: sub a2, a0, a1 ; RV64ZBA-NEXT: subw a0, a0, a1 -; RV64ZBA-NEXT: beq a0, a2, .LBB50_2 +; RV64ZBA-NEXT: beq a0, a2, .LBB51_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB50_2: # %continue +; RV64ZBA-NEXT: .LBB51_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -2811,11 +2862,11 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: xor a0, a1, a0 ; RV32-NEXT: xor a1, a1, a3 ; RV32-NEXT: and a0, a1, a0 -; RV32-NEXT: bgez a0, .LBB51_2 +; RV32-NEXT: bgez a0, .LBB52_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB51_2: # %continue +; RV32-NEXT: .LBB52_2: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; @@ -2824,11 +2875,11 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: sgtz a2, a1 ; RV64-NEXT: sub a1, a0, a1 ; RV64-NEXT: slt a0, a1, a0 -; RV64-NEXT: beq a2, a0, .LBB51_2 +; RV64-NEXT: beq a2, a0, .LBB52_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB51_2: # %continue +; RV64-NEXT: .LBB52_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -2840,11 +2891,11 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: xor a0, a1, a0 ; RV32ZBA-NEXT: xor a1, a1, a3 ; RV32ZBA-NEXT: and a0, a1, a0 -; RV32ZBA-NEXT: bgez a0, .LBB51_2 +; RV32ZBA-NEXT: bgez a0, .LBB52_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB51_2: # %continue +; RV32ZBA-NEXT: .LBB52_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; @@ -2853,11 +2904,11 @@ define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: sgtz a2, a1 ; RV64ZBA-NEXT: sub a1, a0, a1 ; RV64ZBA-NEXT: slt a0, a1, a0 -; RV64ZBA-NEXT: beq a2, a0, .LBB51_2 +; RV64ZBA-NEXT: beq a2, a0, .LBB52_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB51_2: # %continue +; RV64ZBA-NEXT: .LBB52_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -2877,11 +2928,11 @@ define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) { ; RV32-LABEL: usubo.br.i32: ; RV32: # %bb.0: # %entry ; RV32-NEXT: sub a1, a0, a1 -; RV32-NEXT: bgeu a0, a1, .LBB52_2 +; RV32-NEXT: bgeu a0, a1, .LBB53_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB52_2: # %continue +; RV32-NEXT: .LBB53_2: # %continue ; RV32-NEXT: addi a0, zero, 
1 ; RV32-NEXT: ret ; @@ -2889,22 +2940,22 @@ define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) { ; RV64: # %bb.0: # %entry ; RV64-NEXT: subw a1, a0, a1 ; RV64-NEXT: sext.w a0, a0 -; RV64-NEXT: bgeu a0, a1, .LBB52_2 +; RV64-NEXT: bgeu a0, a1, .LBB53_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB52_2: # %continue +; RV64-NEXT: .LBB53_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: usubo.br.i32: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: sub a1, a0, a1 -; RV32ZBA-NEXT: bgeu a0, a1, .LBB52_2 +; RV32ZBA-NEXT: bgeu a0, a1, .LBB53_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB52_2: # %continue +; RV32ZBA-NEXT: .LBB53_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; @@ -2912,11 +2963,11 @@ define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) { ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: subw a1, a0, a1 ; RV64ZBA-NEXT: sext.w a0, a0 -; RV64ZBA-NEXT: bgeu a0, a1, .LBB52_2 +; RV64ZBA-NEXT: bgeu a0, a1, .LBB53_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB52_2: # %continue +; RV64ZBA-NEXT: .LBB53_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -2938,29 +2989,29 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: sltu a4, a0, a2 ; RV32-NEXT: sub a3, a1, a3 ; RV32-NEXT: sub a3, a3, a4 -; RV32-NEXT: beq a3, a1, .LBB53_3 +; RV32-NEXT: beq a3, a1, .LBB54_3 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a1, a3 -; RV32-NEXT: bnez a0, .LBB53_4 -; RV32-NEXT: .LBB53_2: # %continue +; RV32-NEXT: bnez a0, .LBB54_4 +; RV32-NEXT: .LBB54_2: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret -; RV32-NEXT: .LBB53_3: +; RV32-NEXT: .LBB54_3: ; RV32-NEXT: sub a1, a0, a2 ; RV32-NEXT: sltu a0, a0, a1 -; RV32-NEXT: beqz a0, .LBB53_2 -; RV32-NEXT: .LBB53_4: # %overflow +; RV32-NEXT: beqz a0, .LBB54_2 +; RV32-NEXT: .LBB54_4: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret ; ; RV64-LABEL: usubo.br.i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: sub a1, a0, a1 -; RV64-NEXT: bgeu a0, a1, .LBB53_2 +; RV64-NEXT: bgeu a0, a1, .LBB54_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB53_2: # %continue +; RV64-NEXT: .LBB54_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -2969,29 +3020,29 @@ define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: sltu a4, a0, a2 ; RV32ZBA-NEXT: sub a3, a1, a3 ; RV32ZBA-NEXT: sub a3, a3, a4 -; RV32ZBA-NEXT: beq a3, a1, .LBB53_3 +; RV32ZBA-NEXT: beq a3, a1, .LBB54_3 ; RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a1, a3 -; RV32ZBA-NEXT: bnez a0, .LBB53_4 -; RV32ZBA-NEXT: .LBB53_2: # %continue +; RV32ZBA-NEXT: bnez a0, .LBB54_4 +; RV32ZBA-NEXT: .LBB54_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB53_3: +; RV32ZBA-NEXT: .LBB54_3: ; RV32ZBA-NEXT: sub a1, a0, a2 ; RV32ZBA-NEXT: sltu a0, a0, a1 -; RV32ZBA-NEXT: beqz a0, .LBB53_2 -; RV32ZBA-NEXT: .LBB53_4: # %overflow +; RV32ZBA-NEXT: beqz a0, .LBB54_2 +; RV32ZBA-NEXT: .LBB54_4: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: usubo.br.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: sub a1, a0, a1 -; RV64ZBA-NEXT: bgeu a0, a1, .LBB53_2 +; RV64ZBA-NEXT: bgeu a0, a1, .LBB54_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB53_2: # %continue +; RV64ZBA-NEXT: 
.LBB54_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -3013,11 +3064,11 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) { ; RV32-NEXT: mulh a2, a0, a1 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: srai a0, a0, 31 -; RV32-NEXT: beq a2, a0, .LBB54_2 +; RV32-NEXT: beq a2, a0, .LBB55_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB54_2: # %continue +; RV32-NEXT: .LBB55_2: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; @@ -3027,11 +3078,11 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: sext.w a0, a0 ; RV64-NEXT: mul a2, a0, a1 ; RV64-NEXT: mulw a0, a0, a1 -; RV64-NEXT: beq a0, a2, .LBB54_2 +; RV64-NEXT: beq a0, a2, .LBB55_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB54_2: # %continue +; RV64-NEXT: .LBB55_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -3040,11 +3091,11 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) { ; RV32ZBA-NEXT: mulh a2, a0, a1 ; RV32ZBA-NEXT: mul a0, a0, a1 ; RV32ZBA-NEXT: srai a0, a0, 31 -; RV32ZBA-NEXT: beq a2, a0, .LBB54_2 +; RV32ZBA-NEXT: beq a2, a0, .LBB55_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB54_2: # %continue +; RV32ZBA-NEXT: .LBB55_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; @@ -3054,11 +3105,11 @@ define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) { ; RV64ZBA-NEXT: sext.w a0, a0 ; RV64ZBA-NEXT: mul a2, a0, a1 ; RV64ZBA-NEXT: mulw a0, a0, a1 -; RV64ZBA-NEXT: beq a0, a2, .LBB54_2 +; RV64ZBA-NEXT: beq a0, a2, .LBB55_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB54_2: # %continue +; RV64ZBA-NEXT: .LBB55_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -3085,13 +3136,13 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: addi a4, sp, 8 ; RV32-NEXT: call __mulodi4@plt ; RV32-NEXT: lw a0, 8(sp) -; RV32-NEXT: beqz a0, .LBB55_2 +; RV32-NEXT: beqz a0, .LBB56_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero -; RV32-NEXT: j .LBB55_3 -; RV32-NEXT: .LBB55_2: # %continue +; RV32-NEXT: j .LBB56_3 +; RV32-NEXT: .LBB56_2: # %continue ; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: .LBB55_3: # %overflow +; RV32-NEXT: .LBB56_3: # %overflow ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -3101,11 +3152,11 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV64-NEXT: mulh a2, a0, a1 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: beq a2, a0, .LBB55_2 +; RV64-NEXT: beq a2, a0, .LBB56_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB55_2: # %continue +; RV64-NEXT: .LBB56_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -3119,13 +3170,13 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: addi a4, sp, 8 ; RV32ZBA-NEXT: call __mulodi4@plt ; RV32ZBA-NEXT: lw a0, 8(sp) -; RV32ZBA-NEXT: beqz a0, .LBB55_2 +; RV32ZBA-NEXT: beqz a0, .LBB56_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero -; RV32ZBA-NEXT: j .LBB55_3 -; RV32ZBA-NEXT: .LBB55_2: # %continue +; RV32ZBA-NEXT: j .LBB56_3 +; RV32ZBA-NEXT: .LBB56_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 -; RV32ZBA-NEXT: .LBB55_3: # %overflow +; RV32ZBA-NEXT: .LBB56_3: # %overflow ; RV32ZBA-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; 
RV32ZBA-NEXT: ret @@ -3135,11 +3186,11 @@ define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { ; RV64ZBA-NEXT: mulh a2, a0, a1 ; RV64ZBA-NEXT: mul a0, a0, a1 ; RV64ZBA-NEXT: srai a0, a0, 63 -; RV64ZBA-NEXT: beq a2, a0, .LBB55_2 +; RV64ZBA-NEXT: beq a2, a0, .LBB56_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB55_2: # %continue +; RV64ZBA-NEXT: .LBB56_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -3168,13 +3219,13 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32-NEXT: addi a4, sp, 8 ; RV32-NEXT: call __mulodi4@plt ; RV32-NEXT: lw a0, 8(sp) -; RV32-NEXT: beqz a0, .LBB56_2 +; RV32-NEXT: beqz a0, .LBB57_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero -; RV32-NEXT: j .LBB56_3 -; RV32-NEXT: .LBB56_2: # %continue +; RV32-NEXT: j .LBB57_3 +; RV32-NEXT: .LBB57_2: # %continue ; RV32-NEXT: addi a0, zero, 1 -; RV32-NEXT: .LBB56_3: # %overflow +; RV32-NEXT: .LBB57_3: # %overflow ; RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret @@ -3185,11 +3236,11 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV64-NEXT: mulh a2, a0, a1 ; RV64-NEXT: mul a0, a0, a1 ; RV64-NEXT: srai a0, a0, 63 -; RV64-NEXT: beq a2, a0, .LBB56_2 +; RV64-NEXT: beq a2, a0, .LBB57_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB56_2: # %continue +; RV64-NEXT: .LBB57_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -3205,13 +3256,13 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV32ZBA-NEXT: addi a4, sp, 8 ; RV32ZBA-NEXT: call __mulodi4@plt ; RV32ZBA-NEXT: lw a0, 8(sp) -; RV32ZBA-NEXT: beqz a0, .LBB56_2 +; RV32ZBA-NEXT: beqz a0, .LBB57_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero -; RV32ZBA-NEXT: j .LBB56_3 -; RV32ZBA-NEXT: .LBB56_2: # %continue +; RV32ZBA-NEXT: j .LBB57_3 +; RV32ZBA-NEXT: .LBB57_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 -; RV32ZBA-NEXT: .LBB56_3: # %overflow +; RV32ZBA-NEXT: .LBB57_3: # %overflow ; RV32ZBA-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32ZBA-NEXT: addi sp, sp, 16 ; RV32ZBA-NEXT: ret @@ -3222,11 +3273,11 @@ define zeroext i1 @smulo2.br.i64(i64 %v1) { ; RV64ZBA-NEXT: mulh a2, a0, a1 ; RV64ZBA-NEXT: mul a0, a0, a1 ; RV64ZBA-NEXT: srai a0, a0, 63 -; RV64ZBA-NEXT: beq a2, a0, .LBB56_2 +; RV64ZBA-NEXT: beq a2, a0, .LBB57_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB56_2: # %continue +; RV64ZBA-NEXT: .LBB57_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -3246,11 +3297,11 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { ; RV32-LABEL: umulo.br.i32: ; RV32: # %bb.0: # %entry ; RV32-NEXT: mulhu a0, a0, a1 -; RV32-NEXT: beqz a0, .LBB57_2 +; RV32-NEXT: beqz a0, .LBB58_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB57_2: # %continue +; RV32-NEXT: .LBB58_2: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; @@ -3260,22 +3311,22 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { ; RV64-NEXT: slli a0, a0, 32 ; RV64-NEXT: mulhu a0, a0, a1 ; RV64-NEXT: srli a0, a0, 32 -; RV64-NEXT: beqz a0, .LBB57_2 +; RV64-NEXT: beqz a0, .LBB58_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB57_2: # %continue +; RV64-NEXT: .LBB58_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; ; RV32ZBA-LABEL: umulo.br.i32: ; RV32ZBA: # %bb.0: # %entry ; RV32ZBA-NEXT: mulhu a0, a0, a1 
-; RV32ZBA-NEXT: beqz a0, .LBB57_2 +; RV32ZBA-NEXT: beqz a0, .LBB58_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB57_2: # %continue +; RV32ZBA-NEXT: .LBB58_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; @@ -3285,11 +3336,11 @@ define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { ; RV64ZBA-NEXT: zext.w a0, a0 ; RV64ZBA-NEXT: mul a0, a0, a1 ; RV64ZBA-NEXT: srli a0, a0, 32 -; RV64ZBA-NEXT: beqz a0, .LBB57_2 +; RV64ZBA-NEXT: beqz a0, .LBB58_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB57_2: # %continue +; RV64ZBA-NEXT: .LBB58_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -3324,22 +3375,22 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32-NEXT: snez a0, a0 ; RV32-NEXT: or a0, a1, a0 ; RV32-NEXT: or a0, a0, a6 -; RV32-NEXT: beqz a0, .LBB58_2 +; RV32-NEXT: beqz a0, .LBB59_2 ; RV32-NEXT: # %bb.1: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB58_2: # %continue +; RV32-NEXT: .LBB59_2: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; ; RV64-LABEL: umulo.br.i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: mulhu a0, a0, a1 -; RV64-NEXT: beqz a0, .LBB58_2 +; RV64-NEXT: beqz a0, .LBB59_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB58_2: # %continue +; RV64-NEXT: .LBB59_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -3361,22 +3412,22 @@ define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { ; RV32ZBA-NEXT: snez a0, a0 ; RV32ZBA-NEXT: or a0, a1, a0 ; RV32ZBA-NEXT: or a0, a0, a6 -; RV32ZBA-NEXT: beqz a0, .LBB58_2 +; RV32ZBA-NEXT: beqz a0, .LBB59_2 ; RV32ZBA-NEXT: # %bb.1: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB58_2: # %continue +; RV32ZBA-NEXT: .LBB59_2: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo.br.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: mulhu a0, a0, a1 -; RV64ZBA-NEXT: beqz a0, .LBB58_2 +; RV64ZBA-NEXT: beqz a0, .LBB59_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB58_2: # %continue +; RV64ZBA-NEXT: .LBB59_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: @@ -3399,26 +3450,26 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV32-NEXT: sltu a0, a2, a0 ; RV32-NEXT: add a2, a1, a1 ; RV32-NEXT: add a2, a2, a0 -; RV32-NEXT: beq a2, a1, .LBB59_2 +; RV32-NEXT: beq a2, a1, .LBB60_2 ; RV32-NEXT: # %bb.1: # %entry ; RV32-NEXT: sltu a0, a2, a1 -; RV32-NEXT: .LBB59_2: # %entry -; RV32-NEXT: beqz a0, .LBB59_4 +; RV32-NEXT: .LBB60_2: # %entry +; RV32-NEXT: beqz a0, .LBB60_4 ; RV32-NEXT: # %bb.3: # %overflow ; RV32-NEXT: mv a0, zero ; RV32-NEXT: ret -; RV32-NEXT: .LBB59_4: # %continue +; RV32-NEXT: .LBB60_4: # %continue ; RV32-NEXT: addi a0, zero, 1 ; RV32-NEXT: ret ; ; RV64-LABEL: umulo2.br.i64: ; RV64: # %bb.0: # %entry ; RV64-NEXT: add a1, a0, a0 -; RV64-NEXT: bgeu a1, a0, .LBB59_2 +; RV64-NEXT: bgeu a1, a0, .LBB60_2 ; RV64-NEXT: # %bb.1: # %overflow ; RV64-NEXT: mv a0, zero ; RV64-NEXT: ret -; RV64-NEXT: .LBB59_2: # %continue +; RV64-NEXT: .LBB60_2: # %continue ; RV64-NEXT: addi a0, zero, 1 ; RV64-NEXT: ret ; @@ -3428,26 +3479,26 @@ define zeroext i1 @umulo2.br.i64(i64 %v1) { ; RV32ZBA-NEXT: sltu a0, a2, a0 ; RV32ZBA-NEXT: add a2, a1, a1 ; RV32ZBA-NEXT: add a2, a2, a0 -; RV32ZBA-NEXT: beq a2, a1, .LBB59_2 +; RV32ZBA-NEXT: beq a2, a1, .LBB60_2 ; 
RV32ZBA-NEXT: # %bb.1: # %entry ; RV32ZBA-NEXT: sltu a0, a2, a1 -; RV32ZBA-NEXT: .LBB59_2: # %entry -; RV32ZBA-NEXT: beqz a0, .LBB59_4 +; RV32ZBA-NEXT: .LBB60_2: # %entry +; RV32ZBA-NEXT: beqz a0, .LBB60_4 ; RV32ZBA-NEXT: # %bb.3: # %overflow ; RV32ZBA-NEXT: mv a0, zero ; RV32ZBA-NEXT: ret -; RV32ZBA-NEXT: .LBB59_4: # %continue +; RV32ZBA-NEXT: .LBB60_4: # %continue ; RV32ZBA-NEXT: addi a0, zero, 1 ; RV32ZBA-NEXT: ret ; ; RV64ZBA-LABEL: umulo2.br.i64: ; RV64ZBA: # %bb.0: # %entry ; RV64ZBA-NEXT: add a1, a0, a0 -; RV64ZBA-NEXT: bgeu a1, a0, .LBB59_2 +; RV64ZBA-NEXT: bgeu a1, a0, .LBB60_2 ; RV64ZBA-NEXT: # %bb.1: # %overflow ; RV64ZBA-NEXT: mv a0, zero ; RV64ZBA-NEXT: ret -; RV64ZBA-NEXT: .LBB59_2: # %continue +; RV64ZBA-NEXT: .LBB60_2: # %continue ; RV64ZBA-NEXT: addi a0, zero, 1 ; RV64ZBA-NEXT: ret entry: -- GitLab From 06d6b1471eb809aaad0681e1eb88727ac8225d47 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 20 Mar 2021 18:50:14 -0700 Subject: [PATCH 0272/1000] [Driver] Gnu.cpp: remove unneeded -L lib/gcc/$triple/$version/../../../$triple After path resolution, it duplicates a subsequent -L entry. The entry below (lib/gcc/$triple/$version/../../../../$OSLibDir) usually does not exist (e.g. Arch Linux; Debian cross gcc). When it exists, it typically just has ld.so (e.g. Debian native gcc) which cannot cause collision. Removing the -L (similar to reordering it) is therefore justified. --- clang/lib/Driver/ToolChains/Gnu.cpp | 4 +- clang/test/Driver/linux-ld.c | 44 +++++++------------- clang/test/Driver/mips-reduced-toolchain.cpp | 2 - 3 files changed, 16 insertions(+), 34 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 3491a29a5f9c..f9df2370266c 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -2843,10 +2843,8 @@ void Generic_GCC::AddMultilibPaths(const Driver &D, // the cross. Note that GCC does include some of these directories in some // configurations but this seems somewhere between questionable and simply // a bug. 
- if (StringRef(LibPath).startswith(SysRoot)) { - addPathIfExists(D, LibPath + "/" + MultiarchTriple, Paths); + if (StringRef(LibPath).startswith(SysRoot)) addPathIfExists(D, LibPath + "/../" + OSLibDir, Paths); - } } } diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c index 8ba57a941443..a07b289540ec 100644 --- a/clang/test/Driver/linux-ld.c +++ b/clang/test/Driver/linux-ld.c @@ -572,16 +572,15 @@ // RUN: --sysroot=%S/Inputs/ubuntu_12.04_LTS_multiarch_tree \ // RUN: | FileCheck --check-prefix=CHECK-UBUNTU-12-04-ARM-HF %s // CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}/usr/lib/gcc/arm-linux-gnueabihf/4.6.3/../../../arm-linux-gnueabihf{{/|\\\\}}crt1.o" -// CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}/usr/lib/gcc/arm-linux-gnueabihf/4.6.3/../../../arm-linux-gnueabihf{{/|\\\\}}crti.o" +// CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}/usr/lib/arm-linux-gnueabihf{{/|\\\\}}crt1.o" +// CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}/usr/lib/arm-linux-gnueabihf{{/|\\\\}}crti.o" // CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}/usr/lib/gcc/arm-linux-gnueabihf/4.6.3{{/|\\\\}}crtbegin.o" // CHECK-UBUNTU-12-04-ARM-HF: "-L[[SYSROOT]]/usr/lib/gcc/arm-linux-gnueabihf/4.6.3" -// CHECK-UBUNTU-12-04-ARM-HF: "-L[[SYSROOT]]/usr/lib/gcc/arm-linux-gnueabihf/4.6.3/../../../arm-linux-gnueabihf" // CHECK-UBUNTU-12-04-ARM-HF: "-L[[SYSROOT]]/lib/arm-linux-gnueabihf" // CHECK-UBUNTU-12-04-ARM-HF: "-L[[SYSROOT]]/usr/lib/arm-linux-gnueabihf" // CHECK-UBUNTU-12-04-ARM-HF: "-L[[SYSROOT]]/usr/lib/gcc/arm-linux-gnueabihf/4.6.3/../../.." // CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}/usr/lib/gcc/arm-linux-gnueabihf/4.6.3{{/|\\\\}}crtend.o" -// CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}/usr/lib/gcc/arm-linux-gnueabihf/4.6.3/../../../arm-linux-gnueabihf{{/|\\\\}}crtn.o" +// CHECK-UBUNTU-12-04-ARM-HF: "{{.*}}/usr/lib/arm-linux-gnueabihf{{/|\\\\}}crtn.o" // // Check Ubuntu 13.10 on x86-64 targeting arm-linux-gnueabihf. // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ @@ -628,16 +627,15 @@ // RUN: --sysroot=%S/Inputs/ubuntu_14.04_multiarch_tree \ // RUN: | FileCheck --check-prefix=CHECK-UBUNTU-14-04-PPC64LE %s // CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}/usr/lib/gcc/powerpc64le-linux-gnu/4.8/../../../powerpc64le-linux-gnu{{/|\\\\}}crt1.o" -// CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}/usr/lib/gcc/powerpc64le-linux-gnu/4.8/../../../powerpc64le-linux-gnu{{/|\\\\}}crti.o" +// CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}/usr/lib/powerpc64le-linux-gnu{{/|\\\\}}crt1.o" +// CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}/usr/lib/powerpc64le-linux-gnu{{/|\\\\}}crti.o" // CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}/usr/lib/gcc/powerpc64le-linux-gnu/4.8{{/|\\\\}}crtbegin.o" // CHECK-UBUNTU-14-04-PPC64LE: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64le-linux-gnu/4.8" -// CHECK-UBUNTU-14-04-PPC64LE: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64le-linux-gnu/4.8/../../../powerpc64le-linux-gnu" // CHECK-UBUNTU-14-04-PPC64LE: "-L[[SYSROOT]]/lib/powerpc64le-linux-gnu" // CHECK-UBUNTU-14-04-PPC64LE: "-L[[SYSROOT]]/usr/lib/powerpc64le-linux-gnu" // CHECK-UBUNTU-14-04-PPC64LE: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64le-linux-gnu/4.8/../../.." // CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}/usr/lib/gcc/powerpc64le-linux-gnu/4.8{{/|\\\\}}crtend.o" -// CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}/usr/lib/gcc/powerpc64le-linux-gnu/4.8/../../../powerpc64le-linux-gnu{{/|\\\\}}crtn.o" +// CHECK-UBUNTU-14-04-PPC64LE: "{{.*}}/usr/lib/powerpc64le-linux-gnu{{/|\\\\}}crtn.o" // // Check Ubuntu 14.04 on x32. 
// "/usr/lib/gcc/x86_64-linux-gnu/4.8/x32/crtend.o" "/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../libx32/crtn.o" @@ -716,16 +714,15 @@ // RUN: --sysroot=%S/Inputs/ubuntu_12.04_LTS_multiarch_tree \ // RUN: | FileCheck --check-prefix=CHECK-UBUNTU-12-04-ARM %s // CHECK-UBUNTU-12-04-ARM: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-UBUNTU-12-04-ARM: "{{.*}}/usr/lib/gcc/arm-linux-gnueabi/4.6.1/../../../arm-linux-gnueabi{{/|\\\\}}crt1.o" -// CHECK-UBUNTU-12-04-ARM: "{{.*}}/usr/lib/gcc/arm-linux-gnueabi/4.6.1/../../../arm-linux-gnueabi{{/|\\\\}}crti.o" +// CHECK-UBUNTU-12-04-ARM: "{{.*}}/usr/lib/arm-linux-gnueabi{{/|\\\\}}crt1.o" +// CHECK-UBUNTU-12-04-ARM: "{{.*}}/usr/lib/arm-linux-gnueabi{{/|\\\\}}crti.o" // CHECK-UBUNTU-12-04-ARM: "{{.*}}/usr/lib/gcc/arm-linux-gnueabi/4.6.1{{/|\\\\}}crtbegin.o" // CHECK-UBUNTU-12-04-ARM: "-L[[SYSROOT]]/usr/lib/gcc/arm-linux-gnueabi/4.6.1" -// CHECK-UBUNTU-12-04-ARM: "-L[[SYSROOT]]/usr/lib/gcc/arm-linux-gnueabi/4.6.1/../../../arm-linux-gnueabi" // CHECK-UBUNTU-12-04-ARM: "-L[[SYSROOT]]/lib/arm-linux-gnueabi" // CHECK-UBUNTU-12-04-ARM: "-L[[SYSROOT]]/usr/lib/arm-linux-gnueabi" // CHECK-UBUNTU-12-04-ARM: "-L[[SYSROOT]]/usr/lib/gcc/arm-linux-gnueabi/4.6.1/../../.." // CHECK-UBUNTU-12-04-ARM: "{{.*}}/usr/lib/gcc/arm-linux-gnueabi/4.6.1{{/|\\\\}}crtend.o" -// CHECK-UBUNTU-12-04-ARM: "{{.*}}/usr/lib/gcc/arm-linux-gnueabi/4.6.1/../../../arm-linux-gnueabi{{/|\\\\}}crtn.o" +// CHECK-UBUNTU-12-04-ARM: "{{.*}}/usr/lib/arm-linux-gnueabi{{/|\\\\}}crtn.o" // // Test the setup that shipped in SUSE 10.3 on ppc64. // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ @@ -1074,7 +1071,6 @@ // CHECK-DEBIAN-X86: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-X86: "{{.*}}/usr/lib/gcc/i686-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-X86: "-L[[SYSROOT]]/usr/lib/gcc/i686-linux-gnu/4.5" -// CHECK-DEBIAN-X86: "-L[[SYSROOT]]/usr/lib/gcc/i686-linux-gnu/4.5/../../../i386-linux-gnu" // CHECK-DEBIAN-X86: "-L[[SYSROOT]]/usr/lib/i386-linux-gnu" // CHECK-DEBIAN-X86: "-L[[SYSROOT]]/usr/lib/gcc/i686-linux-gnu/4.5/../../.." // CHECK-DEBIAN-X86: "-L[[SYSROOT]]/lib" @@ -1087,7 +1083,6 @@ // CHECK-DEBIAN-X86-64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-X86-64: "{{.*}}/usr/lib/gcc/x86_64-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-X86-64: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.5" -// CHECK-DEBIAN-X86-64: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.5/../../../x86_64-linux-gnu" // CHECK-DEBIAN-X86-64: "-L[[SYSROOT]]/usr/lib/x86_64-linux-gnu" // CHECK-DEBIAN-X86-64: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.5/../../.." // CHECK-DEBIAN-X86-64: "-L[[SYSROOT]]/lib" @@ -1100,7 +1095,6 @@ // CHECK-DEBIAN-PPC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-PPC: "{{.*}}/usr/lib/gcc/powerpc-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-PPC: "-L[[SYSROOT]]/usr/lib/gcc/powerpc-linux-gnu/4.5" -// CHECK-DEBIAN-PPC: "-L[[SYSROOT]]/usr/lib/gcc/powerpc-linux-gnu/4.5/../../../powerpc-linux-gnu" // CHECK-DEBIAN-PPC: "-L[[SYSROOT]]/usr/lib/powerpc-linux-gnu" // CHECK-DEBIAN-PPC: "-L[[SYSROOT]]/usr/lib/gcc/powerpc-linux-gnu/4.5/../../.." 
// CHECK-DEBIAN-PPC: "-L[[SYSROOT]]/lib" @@ -1113,7 +1107,6 @@ // CHECK-DEBIAN-PPC64LE: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-PPC64LE: "{{.*}}/usr/lib/gcc/powerpc64le-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-PPC64LE: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64le-linux-gnu/4.5" -// CHECK-DEBIAN-PPC64LE: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64le-linux-gnu/4.5/../../../powerpc64le-linux-gnu" // CHECK-DEBIAN-PPC64LE: "-L[[SYSROOT]]/usr/lib/powerpc64le-linux-gnu" // CHECK-DEBIAN-PPC64LE: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64le-linux-gnu/4.5/../../.." // CHECK-DEBIAN-PPC64LE: "-L[[SYSROOT]]/lib" @@ -1126,7 +1119,6 @@ // CHECK-DEBIAN-PPC64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-PPC64: "{{.*}}/usr/lib/gcc/powerpc64-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-PPC64: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64-linux-gnu/4.5" -// CHECK-DEBIAN-PPC64: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64-linux-gnu/4.5/../../../powerpc64-linux-gnu" // CHECK-DEBIAN-PPC64: "-L[[SYSROOT]]/usr/lib/powerpc64-linux-gnu" // CHECK-DEBIAN-PPC64: "-L[[SYSROOT]]/usr/lib/gcc/powerpc64-linux-gnu/4.5/../../.." // CHECK-DEBIAN-PPC64: "-L[[SYSROOT]]/lib" @@ -1139,7 +1131,6 @@ // CHECK-DEBIAN-MIPS: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-MIPS: "{{.*}}/usr/lib/gcc/mips-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/usr/lib/gcc/mips-linux-gnu/4.5" -// CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/usr/lib/gcc/mips-linux-gnu/4.5/../../../mips-linux-gnu" // CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/usr/lib/mips-linux-gnu" // CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/usr/lib/gcc/mips-linux-gnu/4.5/../../.." // CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/lib" @@ -1152,7 +1143,6 @@ // CHECK-DEBIAN-MIPSEL: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-MIPSEL: "{{.*}}/usr/lib/gcc/mipsel-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/usr/lib/gcc/mipsel-linux-gnu/4.5" -// CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/usr/lib/gcc/mipsel-linux-gnu/4.5/../../../mipsel-linux-gnu" // CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/usr/lib/mipsel-linux-gnu" // CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/usr/lib/gcc/mipsel-linux-gnu/4.5/../../.." // CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/lib" @@ -1213,7 +1203,6 @@ // CHECK-DEBIAN-SPARC: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-SPARC-SAME: "{{.*}}/usr/lib/gcc/sparc-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-SPARC-SAME: "-L[[SYSROOT]]/usr/lib/gcc/sparc-linux-gnu/4.5" -// CHECK-DEBIAN-SPARC-SAME: "-L[[SYSROOT]]/usr/lib/gcc/sparc-linux-gnu/4.5/../../../sparc-linux-gnu" // CHECK-DEBIAN-SPARC-SAME: "-L[[SYSROOT]]/usr/lib/sparc-linux-gnu" // CHECK-DEBIAN-SPARC-SAME: "-L[[SYSROOT]]/usr/lib/gcc/sparc-linux-gnu/4.5/../../.." // CHECK-DEBIAN-SPARC-SAME: "-L[[SYSROOT]]/lib" @@ -1226,7 +1215,6 @@ // CHECK-DEBIAN-SPARC64: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-SPARC64-SAME: "{{.*}}/usr/lib/gcc/sparc64-linux-gnu/4.5{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-SPARC64-SAME: "-L[[SYSROOT]]/usr/lib/gcc/sparc64-linux-gnu/4.5" -// CHECK-DEBIAN-SPARC64-SAME: "-L[[SYSROOT]]/usr/lib/gcc/sparc64-linux-gnu/4.5/../../../sparc64-linux-gnu" // CHECK-DEBIAN-SPARC64-SAME: "-L[[SYSROOT]]/usr/lib/sparc64-linux-gnu" // CHECK-DEBIAN-SPARC64-SAME: "-L[[SYSROOT]]/usr/lib/gcc/sparc64-linux-gnu/4.5/../../.." 
// CHECK-DEBIAN-SPARC64-SAME: "-L[[SYSROOT]]/lib" @@ -1660,11 +1648,10 @@ // RUN: --sysroot=%S/Inputs/debian_6_mips64_tree \ // RUN: | FileCheck --check-prefix=CHECK-DEBIAN-ML-MIPS64-GNUABI %s // CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/gcc/mips64-linux-gnuabi64/4.9/../../../mips64-linux-gnuabi64{{/|\\\\}}crt1.o" -// CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/gcc/mips64-linux-gnuabi64/4.9/../../../mips64-linux-gnuabi64{{/|\\\\}}crti.o" +// CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/mips64-linux-gnuabi64{{/|\\\\}}crt1.o" +// CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/mips64-linux-gnuabi64{{/|\\\\}}crti.o" // CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/gcc/mips64-linux-gnuabi64/4.9{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-ML-MIPS64-GNUABI: "-L[[SYSROOT]]/usr/lib/gcc/mips64-linux-gnuabi64/4.9" -// CHECK-DEBIAN-ML-MIPS64-GNUABI: "-L[[SYSROOT]]/usr/lib/gcc/mips64-linux-gnuabi64/4.9/../../../mips64-linux-gnuabi64" // CHECK-DEBIAN-ML-MIPS64-GNUABI: "-L[[SYSROOT]]/lib/mips64-linux-gnuabi64" // CHECK-DEBIAN-ML-MIPS64-GNUABI: "-L[[SYSROOT]]/usr/lib/mips64-linux-gnuabi64" // CHECK-DEBIAN-ML-MIPS64-GNUABI: "-L[[SYSROOT]]/usr/lib/gcc/mips64-linux-gnuabi64/4.9" @@ -1672,7 +1659,7 @@ // CHECK-DEBIAN-ML-MIPS64-GNUABI: "-L[[SYSROOT]]/lib" // CHECK-DEBIAN-ML-MIPS64-GNUABI: "-L[[SYSROOT]]/usr/lib" // CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/gcc/mips64-linux-gnuabi64/4.9{{/|\\\\}}crtend.o" -// CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/gcc/mips64-linux-gnuabi64/4.9/../../../mips64-linux-gnuabi64{{/|\\\\}}crtn.o" +// CHECK-DEBIAN-ML-MIPS64-GNUABI: "{{.*}}/usr/lib/mips64-linux-gnuabi64{{/|\\\\}}crtn.o" // // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ // RUN: --target=mips64el-unknown-linux-gnu -rtlib=platform \ @@ -1685,11 +1672,10 @@ // RUN: --sysroot=%S/Inputs/debian_6_mips64_tree \ // RUN: | FileCheck --check-prefix=CHECK-DEBIAN-ML-MIPS64EL-GNUABI %s // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" -// CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "{{.*}}/usr/lib/gcc/mips64el-linux-gnuabi64/4.9/../../../mips64el-linux-gnuabi64{{/|\\\\}}crt1.o" -// CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "{{.*}}/usr/lib/gcc/mips64el-linux-gnuabi64/4.9/../../../mips64el-linux-gnuabi64{{/|\\\\}}crti.o" +// CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "{{.*}}/usr/lib/mips64el-linux-gnuabi64{{/|\\\\}}crt1.o" +// CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "{{.*}}/usr/lib/mips64el-linux-gnuabi64{{/|\\\\}}crti.o" // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "{{.*}}/usr/lib/gcc/mips64el-linux-gnuabi64/4.9{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "-L[[SYSROOT]]/usr/lib/gcc/mips64el-linux-gnuabi64/4.9" -// CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "-L[[SYSROOT]]/usr/lib/gcc/mips64el-linux-gnuabi64/4.9/../../../mips64el-linux-gnuabi64" // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "-L[[SYSROOT]]/lib/mips64el-linux-gnuabi64" // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "-L[[SYSROOT]]/usr/lib/mips64el-linux-gnuabi64" // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "-L[[SYSROOT]]/usr/lib/gcc/mips64el-linux-gnuabi64/4.9" @@ -1697,7 +1683,7 @@ // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "-L[[SYSROOT]]/lib" // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "-L[[SYSROOT]]/usr/lib" // CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "{{.*}}/usr/lib/gcc/mips64el-linux-gnuabi64/4.9{{/|\\\\}}crtend.o" -// CHECK-DEBIAN-ML-MIPS64EL-GNUABI: "{{.*}}/usr/lib/gcc/mips64el-linux-gnuabi64/4.9/../../../mips64el-linux-gnuabi64{{/|\\\\}}crtn.o" +// CHECK-DEBIAN-ML-MIPS64EL-GNUABI: 
"{{.*}}/usr/lib/mips64el-linux-gnuabi64{{/|\\\\}}crtn.o" // // Test linker invocation for Freescale SDK (OpenEmbedded). // RUN: %clang -no-canonical-prefixes %s -### -o %t.o 2>&1 \ diff --git a/clang/test/Driver/mips-reduced-toolchain.cpp b/clang/test/Driver/mips-reduced-toolchain.cpp index 894bdb5a756b..407295e1426f 100644 --- a/clang/test/Driver/mips-reduced-toolchain.cpp +++ b/clang/test/Driver/mips-reduced-toolchain.cpp @@ -9,7 +9,6 @@ // CHECK-DEBIAN-MIPS: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-MIPS: "{{.*}}/usr/lib/gcc/mips-linux-gnu/4.7{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/usr/lib/gcc/mips-linux-gnu/4.7" -// CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/usr/lib/gcc/mips-linux-gnu/4.7/../../../mips-linux-gnu" // CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/usr/lib/mips-linux-gnu" // CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/usr/lib/gcc/mips-linux-gnu/4.7/../../.." // CHECK-DEBIAN-MIPS: "-L[[SYSROOT]]/lib" @@ -23,7 +22,6 @@ // CHECK-DEBIAN-MIPSEL: "{{.*}}ld{{(.exe)?}}" "--sysroot=[[SYSROOT:[^"]+]]" // CHECK-DEBIAN-MIPSEL: "{{.*}}/usr/lib/gcc/mipsel-linux-gnu/4.7{{/|\\\\}}crtbegin.o" // CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/usr/lib/gcc/mipsel-linux-gnu/4.7" -// CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/usr/lib/gcc/mipsel-linux-gnu/4.7/../../../mipsel-linux-gnu" // CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/usr/lib/mipsel-linux-gnu" // CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/usr/lib/gcc/mipsel-linux-gnu/4.7/../../.." // CHECK-DEBIAN-MIPSEL: "-L[[SYSROOT]]/lib" -- GitLab From 775a294820caefdce4e60603eaac0a071dd72765 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 20 Mar 2021 18:56:40 -0700 Subject: [PATCH 0273/1000] [Driver] Gnu.cpp: remove unneeded -L detection for libc++ If clang is installed in the system, the other -L suffice; otherwise $ccc_install_dir/../lib below suffices. --- clang/lib/Driver/ToolChains/Linux.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index ad98013dd4f0..6599f46d0d52 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -307,16 +307,6 @@ Linux::Linux(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) Generic_GCC::AddMultilibPaths(D, SysRoot, OSLibDir, MultiarchTriple, Paths); - // Similar to the logic for GCC above, if we currently running Clang inside - // of the requested system root, add its parent library paths to - // those searched. - // FIXME: It's not clear whether we should use the driver's installed - // directory ('Dir' below) or the ResourceDir. - if (StringRef(D.Dir).startswith(SysRoot)) { - addPathIfExists(D, D.Dir + "/../lib/" + MultiarchTriple, Paths); - addPathIfExists(D, D.Dir + "/../" + OSLibDir, Paths); - } - addPathIfExists(D, SysRoot + "/lib/" + MultiarchTriple, Paths); addPathIfExists(D, SysRoot + "/lib/../" + OSLibDir, Paths); -- GitLab From 0ad0c476efdbc6b8e933edc32e6f943ed3a33b0d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 20 Mar 2021 20:12:45 -0700 Subject: [PATCH 0274/1000] [Driver] Gnu.cpp: remove unneeded -L detection hack for -mx32 Removing the hack actually improves our compatibility with gcc -mx32. 
---
 clang/lib/Driver/ToolChains/Gnu.cpp |  5 -----
 clang/test/Driver/linux-ld.c        | 11 +++++------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index f9df2370266c..bd14bbb63f3a 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2855,11 +2855,6 @@ void Generic_GCC::AddMultiarchPaths(const Driver &D,
 // Try walking via the GCC triple path in case of biarch or multiarch GCC
 // installations with strange symlinks.
 if (GCCInstallation.isValid()) {
-    addPathIfExists(D,
-                    SysRoot + "/usr/lib/" + GCCInstallation.getTriple().str() +
-                    "/../../" + OSLibDir,
-                    Paths);
-
 // Add the 'other' biarch variant path
 Multilib BiarchSibling;
 if (GCCInstallation.getBiarchSibling(BiarchSibling)) {
diff --git a/clang/test/Driver/linux-ld.c b/clang/test/Driver/linux-ld.c
index a07b289540ec..93202da3c083 100644
--- a/clang/test/Driver/linux-ld.c
+++ b/clang/test/Driver/linux-ld.c
@@ -649,12 +649,11 @@
// CHECK-UBUNTU-14-04-X32: "{{.*}}/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../libx32{{/|\\\\}}crti.o"
// CHECK-UBUNTU-14-04-X32: "{{.*}}/usr/lib/gcc/x86_64-linux-gnu/4.8/x32{{/|\\\\}}crtbegin.o"
// CHECK-UBUNTU-14-04-X32: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.8/x32"
-// CHECK-UBUNTU-14-04-X32: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../libx32"
-// CHECK-UBUNTU-14-04-X32: "-L[[SYSROOT]]/lib/../libx32"
-// CHECK-UBUNTU-14-04-X32: "-L[[SYSROOT]]/usr/lib/../libx32"
-// CHECK-UBUNTU-14-04-X32: "-L[[SYSROOT]]/usr/lib/x86_64-linux-gnu/../../libx32"
-// CHECK-UBUNTU-14-04-X32: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.8"
-// CHECK-UBUNTU-14-04-X32: "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.."
+// CHECK-UBUNTU-14-04-X32-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../libx32"
+// CHECK-UBUNTU-14-04-X32-SAME: {{^}} "-L[[SYSROOT]]/lib/../libx32"
+// CHECK-UBUNTU-14-04-X32-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/../libx32"
+// CHECK-UBUNTU-14-04-X32-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.8"
+// CHECK-UBUNTU-14-04-X32-SAME: {{^}} "-L[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.."
// CHECK-UBUNTU-14-04-X32: "{{.*}}/usr/lib/gcc/x86_64-linux-gnu/4.8/x32{{/|\\\\}}crtend.o"
// CHECK-UBUNTU-14-04-X32: "{{.*}}/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../libx32{{/|\\\\}}crtn.o"
//
--
GitLab

From 56700e937903969a4a95f68c59e38e35daaaa1ea Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 20 Mar 2021 21:32:55 -0700
Subject: [PATCH 0275/1000] [Driver] Gnu.cpp: drop an unneeded special rule related to sysroot

It seems unnecessary to diverge from GCC here. Besides, lib/../$OSLibDir
can be considered closer to the GCC installation than to the system root,
so the comment should not apply.
---
 clang/lib/Driver/ToolChains/Gnu.cpp | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index bd14bbb63f3a..39be77463544 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2834,17 +2834,7 @@ void Generic_GCC::AddMultilibPaths(const Driver &D,
 SelectedMultilib.osSuffix(),
 Paths);

-  // If the GCC installation we found is inside of the sysroot, we want to
-  // prefer libraries installed in the parent prefix of the GCC installation.
-  // It is important to *not* use these paths when the GCC installation is
-  // outside of the system root as that can pick up unintended libraries.
-  // This usually happens when there is an external cross compiler on the
-  // host system, and a more minimal sysroot available that is the target of
-  // the cross. Note that GCC does include some of these directories in some
-  // configurations but this seems somewhere between questionable and simply
-  // a bug.
-  if (StringRef(LibPath).startswith(SysRoot))
-    addPathIfExists(D, LibPath + "/../" + OSLibDir, Paths);
+  addPathIfExists(D, LibPath + "/../" + OSLibDir, Paths);
 }
}
--
GitLab

From c2f9086b6184a132ec8cac7edeb620813796e1e8 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sat, 20 Mar 2021 21:37:49 -0700
Subject: [PATCH 0276/1000] [Driver] Gnu.cpp: drop an unneeded special rule related to sysroot

---
 clang/lib/Driver/ToolChains/Gnu.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 39be77463544..078579669634 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2853,8 +2853,6 @@ void Generic_GCC::AddMultiarchPaths(const Driver &D,
 Paths);
 }

-  // See comments above on the multilib variant for details of why this is
-  // included even from outside the sysroot.
 const std::string &LibPath =
 std::string(GCCInstallation.getParentLibPath());
 const llvm::Triple &GCCTriple = GCCInstallation.getTriple();
@@ -2862,11 +2860,7 @@
 addPathIfExists(
 D, LibPath + "/../" + GCCTriple.str() + "/lib" + Multilib.osSuffix(),
 Paths);
-
-  // See comments above on the multilib variant for details of why this is
-  // only included from within the sysroot.
-  if (StringRef(LibPath).startswith(SysRoot))
-    addPathIfExists(D, LibPath, Paths);
+  addPathIfExists(D, LibPath, Paths);
 }
}
--
GitLab

From 2288a75d9eceeabdffcd72789d97386ee10962fb Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Sun, 21 Mar 2021 00:56:03 -0700
Subject: [PATCH 0277/1000] [Driver] Linux.cpp: add -internal-isystem lib/../$triple/include

With this change, for `#include <ar.h>`, `clang --target=aarch64-linux-gnu`
will read `/usr/lib/gcc/aarch64-linux-gnu/10/../../../../aarch64-linux-gnu/include/ar.h`
(on Debian gcc->gcc-cross) instead of `/usr/include/ar.h`. Some glibc
headers (e.g. gnu/stubs.h) are different across architectures.
---
 clang/lib/Driver/ToolChains/Gnu.cpp       | 20 +++++++++++++-------
 clang/lib/Driver/ToolChains/Linux.cpp     | 12 ++++++------
 clang/test/Driver/linux-header-search.cpp |  6 ++++--
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index 078579669634..b5efa587f8dc 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -2867,13 +2867,19 @@ void Generic_GCC::AddMultilibIncludeArgs(const ArgList &DriverArgs,
 ArgStringList &CC1Args) const {
 // Add include directories specific to the selected multilib set and multilib.
-  if (GCCInstallation.isValid()) {
-    const auto &Callback = Multilibs.includeDirsCallback();
-    if (Callback) {
-      for (const auto &Path : Callback(GCCInstallation.getMultilib()))
-        addExternCSystemIncludeIfExists(
-            DriverArgs, CC1Args, GCCInstallation.getInstallPath() + Path);
-    }
+  if (!GCCInstallation.isValid())
+    return;
+  // gcc TOOL_INCLUDE_DIR.
+ const llvm::Triple &GCCTriple = GCCInstallation.getTriple(); + std::string LibPath(GCCInstallation.getParentLibPath()); + addSystemInclude(DriverArgs, CC1Args, + Twine(LibPath) + "/../" + GCCTriple.str() + "/include"); + + const auto &Callback = Multilibs.includeDirsCallback(); + if (Callback) { + for (const auto &Path : Callback(GCCInstallation.getMultilib())) + addExternCSystemIncludeIfExists(DriverArgs, CC1Args, + GCCInstallation.getInstallPath() + Path); } } diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index 6599f46d0d52..0e8da0fea2a7 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -574,9 +574,14 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs, if (DriverArgs.hasArg(clang::driver::options::OPT_nostdinc)) return; - if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) + if (!DriverArgs.hasArg(options::OPT_nostdlibinc)) { + // LOCAL_INCLUDE_DIR addSystemInclude(DriverArgs, CC1Args, SysRoot + "/usr/local/include"); + // TOOL_INCLUDE_DIR + AddMultilibIncludeArgs(DriverArgs, CC1Args); + } + // Note: in gcc, GCC_INCLUDE_DIR (private headers) precedes LOCAL_INCLUDE_DIR. SmallString<128> ResourceDirInclude(D.ResourceDir); llvm::sys::path::append(ResourceDirInclude, "include"); if (!DriverArgs.hasArg(options::OPT_nobuiltininc) && @@ -599,11 +604,6 @@ void Linux::AddClangSystemIncludeArgs(const ArgList &DriverArgs, return; } - // Lacking those, try to detect the correct set of system includes for the - // target triple. - - AddMultilibIncludeArgs(DriverArgs, CC1Args); - // Implement generic Debian multiarch support. const StringRef X86_64MultiarchIncludeDirs[] = { "/usr/include/x86_64-linux-gnu", diff --git a/clang/test/Driver/linux-header-search.cpp b/clang/test/Driver/linux-header-search.cpp index 4aed02f9c15d..3560bd009277 100644 --- a/clang/test/Driver/linux-header-search.cpp +++ b/clang/test/Driver/linux-header-search.cpp @@ -188,7 +188,8 @@ // CHECK-DEBIAN-X86: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/i686-linux-gnu/4.5/../../../../include/c++/4.5/i686-linux-gnu" // CHECK-DEBIAN-X86: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/i686-linux-gnu/4.5/../../../../include/c++/4.5/backward" // CHECK-DEBIAN-X86: "-internal-isystem" "[[SYSROOT]]/usr/local/include" -// CHECK-DEBIAN-X86: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include" +// CHECK-DEBIAN-X86-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/i686-linux-gnu/4.5/../../../../i686-linux-gnu/include" +// CHECK-DEBIAN-X86-SAME: {{^}} "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include" // CHECK-DEBIAN-X86: "-internal-externc-isystem" "[[SYSROOT]]/usr/include/i386-linux-gnu" // CHECK-DEBIAN-X86: "-internal-externc-isystem" "[[SYSROOT]]/include" // CHECK-DEBIAN-X86: "-internal-externc-isystem" "[[SYSROOT]]/usr/include" @@ -204,7 +205,8 @@ // CHECK-DEBIAN-X86-64: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.5/../../../../include/c++/4.5/x86_64-linux-gnu" // CHECK-DEBIAN-X86-64: "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.5/../../../../include/c++/4.5/backward" // CHECK-DEBIAN-X86-64: "-internal-isystem" "[[SYSROOT]]/usr/local/include" -// CHECK-DEBIAN-X86-64: "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include" +// CHECK-DEBIAN-X86-64-SAME: {{^}} "-internal-isystem" "[[SYSROOT]]/usr/lib/gcc/x86_64-linux-gnu/4.5/../../../../x86_64-linux-gnu/include" +// CHECK-DEBIAN-X86-64-SAME: {{^}} "-internal-isystem" "[[RESOURCE_DIR]]{{/|\\\\}}include" // CHECK-DEBIAN-X86-64: 
"-internal-externc-isystem" "[[SYSROOT]]/usr/include/x86_64-linux-gnu" // CHECK-DEBIAN-X86-64: "-internal-externc-isystem" "[[SYSROOT]]/include" // CHECK-DEBIAN-X86-64: "-internal-externc-isystem" "[[SYSROOT]]/usr/include" -- GitLab From 54a05f2ec8da4ac6e02d99e4e2afc24790d6880a Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 21 Mar 2021 09:57:20 +0000 Subject: [PATCH 0278/1000] [X86] computeKnownBitsForTargetNode - add X86ISD::PMULUDQ handling Reuse the existing KnownBits multiplication code to handle what is effectively a ISD::UMUL_LOHI varient --- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +++++++++ llvm/test/CodeGen/X86/shrink_vmul.ll | 28 +++++++++---------------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index ea61af073d93..4b1cd7c26338 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34330,6 +34330,16 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero.setBitsFrom(16); break; } + case X86ISD::PMULUDQ: { + KnownBits Known2; + Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + + Known = Known.trunc(BitWidth / 2).zext(BitWidth); + Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth); + Known = KnownBits::computeForMul(Known, Known2); + break; + } case X86ISD::CMOV: { Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); // If we don't know any bits, early out. diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index e6660f9e4957..ce3a17e3e986 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1946,12 +1946,9 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pxor %xmm1, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X86-SSE-NEXT: psrld $16, %xmm0 ; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: psllq $32, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -1970,12 +1967,9 @@ define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) { ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pxor %xmm1, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X64-SSE-NEXT: psrld $16, %xmm0 ; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE-NEXT: psllq $32, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; @@ -2012,12 +2006,11 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X86-SSE-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X86-SSE-NEXT: psrad $16, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 -; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm1 -; X86-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X86-SSE-NEXT: psllq $32, %xmm0 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -2036,12 +2029,11 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; X64-SSE-NEXT: psrad $16, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm0 -; X64-SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 -; X64-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; X64-SSE-NEXT: psllq $32, %xmm0 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; -- GitLab From 613157dd67ddddc1bbb1e87236efd389358b281b Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 21 Mar 2021 10:16:55 +0000 Subject: [PATCH 0279/1000] [X86] Add PR49658 test case --- llvm/test/CodeGen/X86/combine-pmuldq.ll | 123 ++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 27823cf5fe8c..86bae1899b9a 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -287,3 +287,126 @@ entry: ret i32 %call } declare dso_local i32 @foo(i32, i32, i32, i32) + +define <8 x i32> @PR49658(i32* %ptr, i32 %mul) { +; SSE-LABEL: PR49658: +; SSE: # %bb.0: # %start +; SSE-NEXT: movl %esi, %eax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: .p2align 4, 0x90 +; SSE-NEXT: .LBB7_1: # %loop +; SSE-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-NEXT: pmovzxdq {{.*#+}} xmm3 = mem[0],zero,mem[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm4 = mem[0],zero,mem[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm5 = mem[0],zero,mem[1],zero +; SSE-NEXT: pmovzxdq {{.*#+}} xmm6 = mem[0],zero,mem[1],zero +; SSE-NEXT: pmuludq %xmm2, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm6[1,3] +; SSE-NEXT: paddd %xmm5, %xmm0 +; SSE-NEXT: pmuludq %xmm2, %xmm4 +; SSE-NEXT: pmuludq %xmm2, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3] +; SSE-NEXT: paddd %xmm4, %xmm1 +; SSE-NEXT: subq $-128, %rax +; SSE-NEXT: jne .LBB7_1 +; SSE-NEXT: # %bb.2: # %end +; SSE-NEXT: retq +; +; AVX2-LABEL: PR49658: +; AVX2: # %bb.0: # %start +; AVX2-NEXT: movl %esi, %eax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB7_1: # %loop +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vpmuludq 
%ymm4, %ymm2, %ymm4 +; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 +; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm5 +; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm3 +; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7] +; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: subq $-128, %rax +; AVX2-NEXT: jne .LBB7_1 +; AVX2-NEXT: # %bb.2: # %end +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: PR49658: +; AVX512VL: # %bb.0: # %start +; AVX512VL-NEXT: movl %esi, %eax +; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm2 +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB7_1: # %loop +; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512VL-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 +; AVX512VL-NEXT: vpmuludq %zmm3, %zmm2, %zmm3 +; AVX512VL-NEXT: vpsllq $32, %zmm3, %zmm3 +; AVX512VL-NEXT: vpaddq %zmm3, %zmm4, %zmm3 +; AVX512VL-NEXT: vpsrlq $32, %zmm3, %zmm3 +; AVX512VL-NEXT: vpmovqd %zmm3, %ymm3 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: subq $-128, %rax +; AVX512VL-NEXT: jne .LBB7_1 +; AVX512VL-NEXT: # %bb.2: # %end +; AVX512VL-NEXT: retq +; +; AVX512DQVL-LABEL: PR49658: +; AVX512DQVL: # %bb.0: # %start +; AVX512DQVL-NEXT: movl %esi, %eax +; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1 +; AVX512DQVL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512DQVL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512DQVL-NEXT: .p2align 4, 0x90 +; AVX512DQVL-NEXT: .LBB7_1: # %loop +; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512DQVL-NEXT: vpmullq %zmm2, %zmm1, %zmm2 +; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 +; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512DQVL-NEXT: subq $-128, %rax +; AVX512DQVL-NEXT: jne .LBB7_1 +; AVX512DQVL-NEXT: # %bb.2: # %end +; AVX512DQVL-NEXT: retq +start: + %t1 = zext i32 %mul to i64 + %t2 = insertelement <8 x i64> undef, i64 %t1, i32 0 + %mulvec = shufflevector <8 x i64> %t2, <8 x i64> undef, <8 x i32> zeroinitializer + br label %loop +loop: + %loopcnt = phi i64 [ 0, %start ], [ %nextcnt, %loop ] + %sum = phi <8 x i32> [ zeroinitializer, %start ], [ %nextsum, %loop ] + %ptroff = getelementptr inbounds i32, i32* %ptr, i64 %loopcnt + %vptroff = bitcast i32* %ptroff to <8 x i32>* + %v = load <8 x i32>, <8 x i32>* %vptroff, align 4 + %v64 = zext <8 x i32> %v to <8 x i64> + %vmul = mul nuw <8 x i64> %mulvec, %v64 + %vmulhi = lshr <8 x i64> %vmul, + %vtrunc = trunc <8 x i64> %vmulhi to <8 x i32> + %nextsum = add <8 x i32> %vtrunc, %sum + %nextcnt = add i64 %loopcnt, 32 + %isdone = icmp eq i64 %nextcnt, 524288 + br i1 %isdone, label %end, label %loop +end: + ret <8 x i32> %nextsum +} -- GitLab From 297b9bc3fade62b05839b17d970eb48cf10623a3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 21 Mar 2021 10:40:57 +0000 Subject: [PATCH 0280/1000] [X86][AVX] computeKnownBitsForTargetNode - add X86ISD::VBROADCAST handling for scalar sources The target shuffle code 
handles vector sources, but X86ISD::VBROADCAST can also accept a scalar source for splatting. Suggested by @craig.topper on PR49658 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 8 ++++++ llvm/test/CodeGen/X86/combine-pmuldq.ll | 37 +++++++++---------------- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 4b1cd7c26338..c6af291f24d9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34303,6 +34303,14 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.trunc(BitWidth); break; } + case X86ISD::VBROADCAST: { + SDValue Src = Op.getOperand(0); + if (!Src.getSimpleValueType().isVector()) { + Known = DAG.computeKnownBits(Src, Depth + 1); + return; + } + break; + } case X86ISD::ANDNP: { KnownBits Known2; Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 86bae1899b9a..63e3c48e3520 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -324,24 +324,17 @@ define <8 x i32> @PR49658(i32* %ptr, i32 %mul) { ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm2 ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB7_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm5 -; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm5 -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 -; AVX2-NEXT: vpaddq %ymm3, %ymm5, %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7] -; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7] +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: subq $-128, %rax ; AVX2-NEXT: jne .LBB7_1 ; AVX2-NEXT: # %bb.2: # %end @@ -353,18 +346,14 @@ define <8 x i32> @PR49658(i32* %ptr, i32 %mul) { ; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 -; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm2 ; AVX512VL-NEXT: .p2align 4, 0x90 ; AVX512VL-NEXT: .LBB7_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX512VL-NEXT: vpmuludq %zmm3, %zmm1, %zmm4 -; AVX512VL-NEXT: vpmuludq %zmm3, %zmm2, %zmm3 -; AVX512VL-NEXT: vpsllq $32, %zmm3, %zmm3 -; AVX512VL-NEXT: vpaddq %zmm3, %zmm4, %zmm3 -; AVX512VL-NEXT: vpsrlq $32, %zmm3, %zmm3 -; AVX512VL-NEXT: vpmovqd %zmm3, %ymm3 
-; AVX512VL-NEXT: vpaddd %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX512VL-NEXT: vpmuludq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2
+; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2
+; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
; AVX512VL-NEXT: subq $-128, %rax
; AVX512VL-NEXT: jne .LBB7_1
; AVX512VL-NEXT: # %bb.2: # %end
@@ -380,7 +369,7 @@ define <8 x i32> @PR49658(i32* %ptr, i32 %mul) {
; AVX512DQVL-NEXT: .LBB7_1: # %loop
; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1
; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
-; AVX512DQVL-NEXT: vpmullq %zmm2, %zmm1, %zmm2
+; AVX512DQVL-NEXT: vpmuludq %zmm2, %zmm1, %zmm2
; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2
; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2
; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0
--
GitLab

From 02ffbac844e01df2c95dfcb3117213211fe2226d Mon Sep 17 00:00:00 2001
From: luxufan <932494295@qq.com>
Date: Fri, 19 Mar 2021 17:02:28 +0800
Subject: [PATCH 0281/1000] [RISCV] remove redundant instruction when eliminating a frame index

The redundant `mv a0, a0` was generated when the offset of a stack object
did not fit into a signed 12-bit immediate. To deal with this situation,
eliminateFrameIndex creates a virtual register to hold the offset, which
the register scavenger then has to scavenge. When the machine instruction
that contains the stack object is an ADDI (generated for a FrameIndex
node) and its destination register happens to be the same register that
the scavenger produces, the `mv a0, a0` was emitted. To eliminate this
instruction, eliminateFrameIndex now emits the final ADD directly into the
ADDI's destination register and erases the ADDI, instead of funneling the
result through the scratch register.
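As a concrete illustration, the test_emergency_spill_slot function in the
large-stack.ll test (updated in the diff below) used to end its address
computation with a redundant copy:

  lui a2, 1048478
  addi a2, a2, 1388
  add a2, s0, a2
  mv a2, a2

With this change the add targets the ADDI's destination register directly,
so the trailing mv a2, a2 is no longer emitted.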
Differential Revision: https://reviews.llvm.org/D92479 --- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 7 +++++++ llvm/test/CodeGen/RISCV/large-stack.ll | 1 - llvm/test/CodeGen/RISCV/stack-realignment.ll | 4 ---- llvm/test/CodeGen/RISCV/vararg.ll | 3 --- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index ad6d3af21d58..7428f1019236 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -213,6 +213,13 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Modify Offset and FrameReg appropriately Register ScratchReg = MRI.createVirtualRegister(&RISCV::GPRRegClass); TII->movImm(MBB, II, DL, ScratchReg, Offset.getFixed()); + if (MI.getOpcode() == RISCV::ADDI) { + BuildMI(MBB, II, DL, TII->get(RISCV::ADD), MI.getOperand(0).getReg()) + .addReg(FrameReg) + .addReg(ScratchReg, RegState::Kill); + MI.eraseFromParent(); + return; + } BuildMI(MBB, II, DL, TII->get(RISCV::ADD), ScratchReg) .addReg(FrameReg) .addReg(ScratchReg, RegState::Kill); diff --git a/llvm/test/CodeGen/RISCV/large-stack.ll b/llvm/test/CodeGen/RISCV/large-stack.ll index e4cf5eb28399..962d88907ee0 100644 --- a/llvm/test/CodeGen/RISCV/large-stack.ll +++ b/llvm/test/CodeGen/RISCV/large-stack.ll @@ -101,7 +101,6 @@ define void @test_emergency_spill_slot(i32 %a) { ; RV32I-WITHFP-NEXT: lui a2, 1048478 ; RV32I-WITHFP-NEXT: addi a2, a2, 1388 ; RV32I-WITHFP-NEXT: add a2, s0, a2 -; RV32I-WITHFP-NEXT: mv a2, a2 ; RV32I-WITHFP-NEXT: add a1, a2, a1 ; RV32I-WITHFP-NEXT: #APP ; RV32I-WITHFP-NEXT: nop diff --git a/llvm/test/CodeGen/RISCV/stack-realignment.ll b/llvm/test/CodeGen/RISCV/stack-realignment.ll index 6f72a2488c27..681ed762346c 100644 --- a/llvm/test/CodeGen/RISCV/stack-realignment.ll +++ b/llvm/test/CodeGen/RISCV/stack-realignment.ll @@ -460,7 +460,6 @@ define void @caller2048() { ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: addi a0, a0, -2048 ; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: mv a0, a0 ; RV32I-NEXT: call callee@plt ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: sub sp, s0, a0 @@ -489,7 +488,6 @@ define void @caller2048() { ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: addiw a0, a0, -2048 ; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: mv a0, a0 ; RV64I-NEXT: call callee@plt ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: sub sp, s0, a0 @@ -552,7 +550,6 @@ define void @caller4096() { ; RV32I-NEXT: slli sp, a0, 12 ; RV32I-NEXT: lui a0, 1 ; RV32I-NEXT: add a0, sp, a0 -; RV32I-NEXT: mv a0, a0 ; RV32I-NEXT: call callee@plt ; RV32I-NEXT: lui a0, 2 ; RV32I-NEXT: sub sp, s0, a0 @@ -581,7 +578,6 @@ define void @caller4096() { ; RV64I-NEXT: slli sp, a0, 12 ; RV64I-NEXT: lui a0, 1 ; RV64I-NEXT: add a0, sp, a0 -; RV64I-NEXT: mv a0, a0 ; RV64I-NEXT: call callee@plt ; RV64I-NEXT: lui a0, 2 ; RV64I-NEXT: sub sp, s0, a0 diff --git a/llvm/test/CodeGen/RISCV/vararg.ll b/llvm/test/CodeGen/RISCV/vararg.ll index 0f2001b2746a..7efa1a372603 100644 --- a/llvm/test/CodeGen/RISCV/vararg.ll +++ b/llvm/test/CodeGen/RISCV/vararg.ll @@ -1773,7 +1773,6 @@ define i32 @va_large_stack(i8* %fmt, ...) { ; ILP32-ILP32F-FPELIM-NEXT: lui a1, 24414 ; ILP32-ILP32F-FPELIM-NEXT: addi a1, a1, 280 ; ILP32-ILP32F-FPELIM-NEXT: add a1, sp, a1 -; ILP32-ILP32F-FPELIM-NEXT: mv a1, a1 ; ILP32-ILP32F-FPELIM-NEXT: sw a1, 12(sp) ; ILP32-ILP32F-FPELIM-NEXT: lui a1, 24414 ; ILP32-ILP32F-FPELIM-NEXT: addi a1, a1, 304 @@ -1852,7 +1851,6 @@ define i32 @va_large_stack(i8* %fmt, ...) 
{
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a1, 24414
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a1, a1, 280
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: add a1, sp, a1
-; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: mv a1, a1
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: sw a1, 12(sp)
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: lui a1, 24414
; RV32D-ILP32-ILP32F-ILP32D-FPELIM-NEXT: addi a1, a1, 304
@@ -1896,7 +1894,6 @@ define i32 @va_large_stack(i8* %fmt, ...) {
; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414
; LP64-LP64F-LP64D-FPELIM-NEXT: addiw a0, a0, 284
; LP64-LP64F-LP64D-FPELIM-NEXT: add a0, sp, a0
-; LP64-LP64F-LP64D-FPELIM-NEXT: mv a0, a0
; LP64-LP64F-LP64D-FPELIM-NEXT: sd a0, 8(sp)
; LP64-LP64F-LP64D-FPELIM-NEXT: lui a0, 24414
; LP64-LP64F-LP64D-FPELIM-NEXT: addiw a0, a0, 280
--
GitLab

From 6d9d2049c8532457e86a48f602a7e5d5ed2828d3 Mon Sep 17 00:00:00 2001
From: David Green
Date: Sun, 21 Mar 2021 12:00:06 +0000
Subject: [PATCH 0282/1000] [ARM] VINS f16 pattern

This adds an extra pattern for inserting an f16 into an odd vector lane
via a VINS. If the dual-insert-lane pattern does not happen to apply,
this can help with some simple cases.

Differential Revision: https://reviews.llvm.org/D95471
---
 llvm/lib/Target/ARM/ARMInstrMVE.td            |   7 +-
 .../CodeGen/Thumb2/mve-float16regloops.ll     |  16 +-
 llvm/test/CodeGen/Thumb2/mve-masked-load.ll   |  24 +-
 llvm/test/CodeGen/Thumb2/mve-shuffle.ll       |   3 +-
 llvm/test/CodeGen/Thumb2/mve-vst3.ll          | 252 +++++++++---------
 5 files changed, 146 insertions(+), 156 deletions(-)

diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 7d1c9017e3dc..c4830e7351f5 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -1900,8 +1900,13 @@ let Predicates = [HasMVEInt] in {
 def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane),
 (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>;
-  def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm:$lane),
+  def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_even:$lane),
 (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>;
+  def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm_odd:$lane),
+            (COPY_TO_REGCLASS (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)),
+                (VINSH (EXTRACT_SUBREG MQPR:$src1, (SSubReg_f16_reg imm_odd:$lane)),
+                       (COPY_TO_REGCLASS HPR:$src2, SPR)),
+                (SSubReg_f16_reg imm_odd:$lane)), MQPR)>;
 def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
 (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
 def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index 6b053b8fd104..dd8c4f110691 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -1468,19 +1468,19 @@ define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instanc
; CHECK-NEXT: @ Parent Loop BB17_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrh r7, [r1], #4
-; CHECK-NEXT: vmov r4, s4
+; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vfma.f16 q2, q3, r7
-; CHECK-NEXT: ldrh r3, [r1, #-2]
+; CHECK-NEXT: ldrh r4, [r1, #-2]
; CHECK-NEXT: vmov.u16 r7, q2[0]
; CHECK-NEXT: vfma.f16 q2, q4, r7
-; CHECK-NEXT: vmov.16 q2[3], r4
-; CHECK-NEXT: vfma.f16 q2, q5, r3
-; CHECK-NEXT: vmov.u16 r3, q2[1]
-; CHECK-NEXT: vfma.f16 q2, q6, r3
-; CHECK-NEXT: strh r3, [r5,
#2] +; CHECK-NEXT: vins.f16 s9, s4 +; CHECK-NEXT: vfma.f16 q2, q5, r4 +; CHECK-NEXT: vmov.u16 r4, q2[1] +; CHECK-NEXT: vfma.f16 q2, q6, r4 +; CHECK-NEXT: strh r4, [r5, #2] ; CHECK-NEXT: vmov.f32 s8, s9 ; CHECK-NEXT: strh r7, [r5], #4 -; CHECK-NEXT: vmov.16 q2[2], r4 +; CHECK-NEXT: vmov.16 q2[2], r3 ; CHECK-NEXT: le lr, .LBB17_5 ; CHECK-NEXT: .LBB17_6: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1 diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll index 9d5e3412946a..02895b0a214c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -1500,8 +1500,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-LE-NEXT: ldrh r2, [r0, #2] ; CHECK-LE-NEXT: strh.w r2, [sp, #24] ; CHECK-LE-NEXT: vldr.16 s4, [sp, #24] -; CHECK-LE-NEXT: vmov r2, s4 -; CHECK-LE-NEXT: vmov.16 q0[1], r2 +; CHECK-LE-NEXT: vins.f16 s0, s4 ; CHECK-LE-NEXT: lsls r2, r1, #29 ; CHECK-LE-NEXT: bpl .LBB45_3 ; CHECK-LE-NEXT: .LBB45_11: @ %cond.load4 @@ -1516,8 +1515,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-LE-NEXT: ldrh r2, [r0, #6] ; CHECK-LE-NEXT: strh.w r2, [sp, #16] ; CHECK-LE-NEXT: vldr.16 s4, [sp, #16] -; CHECK-LE-NEXT: vmov r2, s4 -; CHECK-LE-NEXT: vmov.16 q0[3], r2 +; CHECK-LE-NEXT: vins.f16 s1, s4 ; CHECK-LE-NEXT: lsls r2, r1, #27 ; CHECK-LE-NEXT: bpl .LBB45_5 ; CHECK-LE-NEXT: .LBB45_13: @ %cond.load10 @@ -1532,8 +1530,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-LE-NEXT: ldrh r2, [r0, #10] ; CHECK-LE-NEXT: strh.w r2, [sp, #8] ; CHECK-LE-NEXT: vldr.16 s4, [sp, #8] -; CHECK-LE-NEXT: vmov r2, s4 -; CHECK-LE-NEXT: vmov.16 q0[5], r2 +; CHECK-LE-NEXT: vins.f16 s2, s4 ; CHECK-LE-NEXT: lsls r2, r1, #25 ; CHECK-LE-NEXT: bpl .LBB45_7 ; CHECK-LE-NEXT: .LBB45_15: @ %cond.load16 @@ -1548,8 +1545,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-LE-NEXT: ldrh r0, [r0, #14] ; CHECK-LE-NEXT: strh.w r0, [sp] ; CHECK-LE-NEXT: vldr.16 s4, [sp] -; CHECK-LE-NEXT: vmov r0, s4 -; CHECK-LE-NEXT: vmov.16 q0[7], r0 +; CHECK-LE-NEXT: vins.f16 s3, s4 ; CHECK-LE-NEXT: add sp, #40 ; CHECK-LE-NEXT: bx lr ; @@ -1614,8 +1610,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-BE-NEXT: ldrh r0, [r0, #14] ; CHECK-BE-NEXT: strh.w r0, [sp] ; CHECK-BE-NEXT: vldr.16 s0, [sp] -; CHECK-BE-NEXT: vmov r0, s0 -; CHECK-BE-NEXT: vmov.16 q1[7], r0 +; CHECK-BE-NEXT: vins.f16 s7, s0 ; CHECK-BE-NEXT: .LBB45_9: @ %else20 ; CHECK-BE-NEXT: vrev64.16 q0, q1 ; CHECK-BE-NEXT: add sp, #40 @@ -1630,8 +1625,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-BE-NEXT: ldrh r2, [r0, #2] ; CHECK-BE-NEXT: strh.w r2, [sp, #24] ; CHECK-BE-NEXT: vldr.16 s0, [sp, #24] -; CHECK-BE-NEXT: vmov r2, s0 -; CHECK-BE-NEXT: vmov.16 q1[1], r2 +; CHECK-BE-NEXT: vins.f16 s4, s0 ; CHECK-BE-NEXT: lsls r2, r1, #26 ; CHECK-BE-NEXT: bpl .LBB45_3 ; CHECK-BE-NEXT: .LBB45_12: @ %cond.load4 @@ -1646,8 +1640,7 @@ define arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-BE-NEXT: ldrh r2, [r0, #6] ; CHECK-BE-NEXT: strh.w r2, [sp, #16] ; CHECK-BE-NEXT: vldr.16 s0, [sp, #16] -; CHECK-BE-NEXT: vmov r2, s0 -; CHECK-BE-NEXT: vmov.16 q1[3], r2 +; CHECK-BE-NEXT: vins.f16 s5, s0 ; CHECK-BE-NEXT: lsls r2, r1, #28 ; CHECK-BE-NEXT: bpl .LBB45_5 ; CHECK-BE-NEXT: .LBB45_14: @ %cond.load10 @@ -1662,8 +1655,7 @@ define 
arm_aapcs_vfpcc <8 x half> @masked_v8f16_align1_undef(<8 x half> *%dest, ; CHECK-BE-NEXT: ldrh r2, [r0, #10] ; CHECK-BE-NEXT: strh.w r2, [sp, #8] ; CHECK-BE-NEXT: vldr.16 s0, [sp, #8] -; CHECK-BE-NEXT: vmov r2, s0 -; CHECK-BE-NEXT: vmov.16 q1[5], r2 +; CHECK-BE-NEXT: vins.f16 s6, s0 ; CHECK-BE-NEXT: lsls r2, r1, #30 ; CHECK-BE-NEXT: bpl .LBB45_7 ; CHECK-BE-NEXT: .LBB45_16: @ %cond.load16 diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll index db8f7018ba55..415ce651b5ca 100644 --- a/llvm/test/CodeGen/Thumb2/mve-shuffle.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffle.ll @@ -1319,8 +1319,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @oneoff21_f16(<8 x half> %src1, <8 x half> %src2) { ; CHECK-LABEL: oneoff21_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov.16 q1[3], r0 +; CHECK-NEXT: vins.f16 s5, s0 ; CHECK-NEXT: vmov q0, q1 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vst3.ll b/llvm/test/CodeGen/Thumb2/mve-vst3.ll index c1367ea819a9..f569ddb2de91 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vst3.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vst3.ll @@ -1392,63 +1392,61 @@ define void @vst3_v8f16(<8 x half> *%src, <24 x half> *%dst) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14} -; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q3, [r0] ; CHECK-NEXT: vldrw.u32 q5, [r0, #16] -; CHECK-NEXT: vmov.f64 d0, d4 +; CHECK-NEXT: vmov.f64 d0, d6 ; CHECK-NEXT: vmovx.f16 s6, s20 -; CHECK-NEXT: vmovx.f16 s12, s8 -; CHECK-NEXT: vmovx.f16 s24, s23 -; CHECK-NEXT: vmov.f32 s4, s9 +; CHECK-NEXT: vmovx.f16 s8, s12 +; CHECK-NEXT: vmov.f32 s4, s13 ; CHECK-NEXT: vins.f16 s0, s20 ; CHECK-NEXT: vmov r2, s6 ; CHECK-NEXT: vins.f16 s4, s21 ; CHECK-NEXT: vmov.16 q0[4], r2 ; CHECK-NEXT: vmov.f32 s3, s4 ; CHECK-NEXT: vldrw.u32 q1, [r0, #32] -; CHECK-NEXT: vmov.f32 s1, s8 +; CHECK-NEXT: vmov.f32 s1, s12 ; CHECK-NEXT: vmov.f32 s17, s4 -; CHECK-NEXT: vmovx.f16 s26, s6 +; CHECK-NEXT: vmovx.f16 s24, s7 ; CHECK-NEXT: vmov.f32 s18, s4 -; CHECK-NEXT: vins.f16 s17, s12 -; CHECK-NEXT: vmovx.f16 s12, s18 -; CHECK-NEXT: vins.f16 s2, s12 -; CHECK-NEXT: vmovx.f16 s12, s7 -; CHECK-NEXT: vins.f16 s24, s12 -; CHECK-NEXT: vmovx.f16 s12, s22 -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vins.f16 s12, s26 -; CHECK-NEXT: vmov.16 q3[3], r0 -; CHECK-NEXT: vrev32.16 q5, q5 -; CHECK-NEXT: vmov.f32 s15, s24 -; CHECK-NEXT: vmov.f32 s25, s11 -; CHECK-NEXT: vmov.f32 s14, s7 -; CHECK-NEXT: vmovx.f16 s28, s13 -; CHECK-NEXT: vmov.f32 s26, s11 +; CHECK-NEXT: vins.f16 s17, s8 +; CHECK-NEXT: vmovx.f16 s8, s18 +; CHECK-NEXT: vins.f16 s2, s8 +; CHECK-NEXT: vmovx.f16 s11, s23 +; CHECK-NEXT: vins.f16 s11, s24 +; CHECK-NEXT: vmovx.f16 s24, s6 +; CHECK-NEXT: vmovx.f16 s8, s22 +; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vins.f16 s8, s24 +; CHECK-NEXT: vmov.f32 s25, s15 +; CHECK-NEXT: vins.f16 s9, s23 +; CHECK-NEXT: vmov.f32 s26, s15 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmovx.f16 s28, s9 ; CHECK-NEXT: vins.f16 s25, s28 ; CHECK-NEXT: vmovx.f16 s28, s26 -; CHECK-NEXT: vins.f16 s14, s28 -; CHECK-NEXT: vmovx.f16 s28, s9 +; CHECK-NEXT: vins.f16 s10, s28 +; CHECK-NEXT: vmovx.f16 s28, s13 ; CHECK-NEXT: vmov.f32 s4, s5 +; CHECK-NEXT: vrev32.16 q5, q5 ; CHECK-NEXT: vins.f16 s4, s28 -; CHECK-NEXT: vmovx.f16 s28, s10 +; CHECK-NEXT: vmovx.f16 s28, s14 ; CHECK-NEXT: vins.f16 s6, s28 -; CHECK-NEXT: vmov.f32 s18, s2 +; CHECK-NEXT: vmov.f32 s26, s10 ; CHECK-NEXT: vmov.f32 s7, s6 -; 
CHECK-NEXT: vmov.f32 s6, s10 -; CHECK-NEXT: vmovx.f16 s8, s5 -; CHECK-NEXT: vins.f16 s21, s8 -; CHECK-NEXT: vmovx.f16 s8, s22 -; CHECK-NEXT: vins.f16 s6, s8 -; CHECK-NEXT: vmov.f32 s26, s14 -; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s6, s14 +; CHECK-NEXT: vmovx.f16 s12, s5 +; CHECK-NEXT: vins.f16 s21, s12 +; CHECK-NEXT: vmovx.f16 s12, s22 +; CHECK-NEXT: vins.f16 s6, s12 ; CHECK-NEXT: vmov.f32 s1, s17 -; CHECK-NEXT: vmov.f32 s13, s25 +; CHECK-NEXT: vmov.f32 s22, s6 +; CHECK-NEXT: vmov.f32 s9, s25 ; CHECK-NEXT: vmov.f32 s5, s21 ; CHECK-NEXT: vmov.f32 s2, s18 +; CHECK-NEXT: vmov.f32 s10, s26 ; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vmov.f32 s14, s26 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] ; CHECK-NEXT: vmov.f32 s6, s22 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vstrw.32 q1, [r1, #16] ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14} ; CHECK-NEXT: bx lr @@ -1473,146 +1471,142 @@ define void @vst3_v16f16(<16 x half> *%src, <48 x half> *%dst) { ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: .pad #128 ; CHECK-NEXT: sub sp, #128 -; CHECK-NEXT: vldrw.u32 q3, [r0, #32] +; CHECK-NEXT: vldrw.u32 q1, [r0, #32] ; CHECK-NEXT: vldrw.u32 q7, [r0, #64] -; CHECK-NEXT: vldrw.u32 q4, [r0] -; CHECK-NEXT: vldrw.u32 q5, [r0, #48] +; CHECK-NEXT: vldrw.u32 q5, [r0, #80] ; CHECK-NEXT: vmovx.f16 s0, s31 -; CHECK-NEXT: vmovx.f16 s2, s15 -; CHECK-NEXT: vins.f16 s2, s0 +; CHECK-NEXT: vmovx.f16 s11, s7 +; CHECK-NEXT: vins.f16 s11, s0 ; CHECK-NEXT: vmovx.f16 s0, s30 -; CHECK-NEXT: vmovx.f16 s4, s14 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vins.f16 s4, s0 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmov.16 q1[3], r2 -; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.f32 s7, s2 -; CHECK-NEXT: vmovx.f16 s2, s20 -; CHECK-NEXT: vmov.f32 s6, s31 -; CHECK-NEXT: vmovx.f16 s0, s5 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov.f32 s5, s19 -; CHECK-NEXT: vmov.f32 s6, s19 -; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s8, s6 +; CHECK-NEXT: vmov q4, q1 +; CHECK-NEXT: vins.f16 s8, s0 +; CHECK-NEXT: vstrw.32 q4, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s9, s7 +; CHECK-NEXT: vmov.f32 s10, s31 +; CHECK-NEXT: vmovx.f16 s0, s9 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov.f32 s5, s11 +; CHECK-NEXT: vmov q6, q2 +; CHECK-NEXT: vmov.f32 s6, s11 +; CHECK-NEXT: vldrw.u32 q2, [r0, #48] ; CHECK-NEXT: vins.f16 s5, s0 ; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q1, [r0, #16] -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d4, d2 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s2, s12 -; CHECK-NEXT: vins.f16 s8, s20 +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmovx.f16 s2, s8 +; CHECK-NEXT: vstrw.32 q3, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.f64 d6, d2 +; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill ; CHECK-NEXT: vmov.f32 s0, s5 -; CHECK-NEXT: vins.f16 s0, s21 -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vldrw.u32 q5, [r0, #80] -; CHECK-NEXT: vmov.f32 s11, s0 -; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vins.f16 s12, s8 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vins.f16 s0, s9 +; CHECK-NEXT: vmov.16 q3[4], r2 +; CHECK-NEXT: vmovx.f16 s2, s16 +; CHECK-NEXT: vmov.f32 s15, s0 ; CHECK-NEXT: vmovx.f16 s0, s4 
+; CHECK-NEXT: vmov.f32 s13, s4 ; CHECK-NEXT: vmov.f32 s5, s20 ; CHECK-NEXT: vmov.f32 s6, s20 ; CHECK-NEXT: vins.f16 s5, s0 ; CHECK-NEXT: vmovx.f16 s0, s6 ; CHECK-NEXT: vstrw.32 q1, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmov.f64 d2, d8 -; CHECK-NEXT: vins.f16 s10, s0 -; CHECK-NEXT: vstrw.32 q2, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vmov q2, q6 -; CHECK-NEXT: vmovx.f16 s24, s10 -; CHECK-NEXT: vmov.f32 s0, s17 -; CHECK-NEXT: vins.f16 s4, s12 +; CHECK-NEXT: vmov q1, q6 +; CHECK-NEXT: vins.f16 s14, s0 +; CHECK-NEXT: vmov.f32 s0, s5 +; CHECK-NEXT: vins.f16 s24, s16 ; CHECK-NEXT: vmov r0, s2 -; CHECK-NEXT: vins.f16 s0, s13 -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.f32 s13, s28 -; CHECK-NEXT: vmov.f32 s7, s0 -; CHECK-NEXT: vmovx.f16 s0, s16 -; CHECK-NEXT: vmov.f32 s14, s28 -; CHECK-NEXT: vmovx.f16 s2, s11 -; CHECK-NEXT: vins.f16 s13, s0 -; CHECK-NEXT: vmov.f32 s5, s16 -; CHECK-NEXT: vmovx.f16 s0, s14 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vins.f16 s6, s0 +; CHECK-NEXT: vins.f16 s0, s17 +; CHECK-NEXT: vmov.16 q6[4], r0 +; CHECK-NEXT: vmov.f32 s27, s0 +; CHECK-NEXT: vmovx.f16 s0, s4 +; CHECK-NEXT: vmov.f32 s25, s4 +; CHECK-NEXT: vmov.f32 s5, s28 +; CHECK-NEXT: vmov.f32 s6, s28 +; CHECK-NEXT: vins.f16 s5, s0 +; CHECK-NEXT: vmovx.f16 s0, s6 +; CHECK-NEXT: vstrw.32 q1, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vins.f16 s26, s0 ; CHECK-NEXT: vmovx.f16 s0, s22 -; CHECK-NEXT: vins.f16 s24, s0 -; CHECK-NEXT: vstrw.32 q1, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q2 +; CHECK-NEXT: vmovx.f16 s4, s10 +; CHECK-NEXT: vins.f16 s4, s0 ; CHECK-NEXT: vmovx.f16 s0, s23 -; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s2, s0 -; CHECK-NEXT: vmov.16 q6[3], r0 -; CHECK-NEXT: vrev32.16 q1, q1 -; CHECK-NEXT: vmov.f32 s27, s2 +; CHECK-NEXT: vmovx.f16 s7, s11 +; CHECK-NEXT: vmov.f32 s28, s29 +; CHECK-NEXT: vins.f16 s7, s0 +; CHECK-NEXT: vins.f16 s5, s11 +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s6, s23 +; CHECK-NEXT: vmovx.f16 s16, s5 ; CHECK-NEXT: vmov.f32 s1, s11 -; CHECK-NEXT: vmov.f32 s26, s23 -; CHECK-NEXT: vmovx.f16 s16, s25 ; CHECK-NEXT: vmov.f32 s2, s11 ; CHECK-NEXT: vins.f16 s1, s16 ; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vins.f16 s26, s16 +; CHECK-NEXT: vins.f16 s6, s16 ; CHECK-NEXT: vmovx.f16 s16, s9 ; CHECK-NEXT: vmov.f32 s20, s21 ; CHECK-NEXT: vins.f16 s20, s16 ; CHECK-NEXT: vmovx.f16 s16, s10 ; CHECK-NEXT: vins.f16 s22, s16 -; CHECK-NEXT: vmov.f32 s2, s26 +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s23, s22 -; CHECK-NEXT: vmov.f32 s22, s10 +; CHECK-NEXT: vrev32.16 q2, q2 +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s22, s18 ; CHECK-NEXT: vmovx.f16 s16, s21 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vins.f16 s5, s16 +; CHECK-NEXT: vins.f16 s9, s16 ; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s25, s1 -; CHECK-NEXT: vmov.f32 s18, s10 -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s14 +; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q4, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vmovx.f16 s16, s6 -; CHECK-NEXT: vmov.f32 s14, s10 +; CHECK-NEXT: vldrw.u32 q4, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s26 +; CHECK-NEXT: vstrw.32 q4, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vmovx.f16 s16, s10 ; CHECK-NEXT: vins.f16 s22, s16 ; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload -; 
CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s28, s29 +; CHECK-NEXT: vmov.f32 s5, s1 ; CHECK-NEXT: vmovx.f16 s8, s17 -; CHECK-NEXT: vmov.f32 s26, s2 -; CHECK-NEXT: vmov.f32 s5, s13 +; CHECK-NEXT: vmov.f32 s6, s2 ; CHECK-NEXT: vins.f16 s28, s8 ; CHECK-NEXT: vmovx.f16 s0, s18 -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vins.f16 s30, s0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s6, s14 -; CHECK-NEXT: vldrw.u32 q3, [sp, #112] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s31, s30 -; CHECK-NEXT: vrev32.16 q0, q0 +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s30, s18 ; CHECK-NEXT: vmovx.f16 s16, s29 -; CHECK-NEXT: vmov.f32 s9, s13 +; CHECK-NEXT: vrev32.16 q0, q0 +; CHECK-NEXT: vstrw.32 q1, [r1, #80] ; CHECK-NEXT: vins.f16 s1, s16 -; CHECK-NEXT: vmov.f32 s10, s14 ; CHECK-NEXT: vmovx.f16 s16, s2 -; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vins.f16 s30, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s2, s30 -; CHECK-NEXT: vmov.f32 s18, s14 -; CHECK-NEXT: vstrw.32 q2, [r1, #48] -; CHECK-NEXT: vmov.f32 s13, s17 -; CHECK-NEXT: vstrw.32 q6, [r1, #80] +; CHECK-NEXT: vmov.f32 s25, s17 +; CHECK-NEXT: vmov.f32 s26, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #112] @ 16-byte Reload ; CHECK-NEXT: vmov.f32 s29, s1 -; CHECK-NEXT: vstrw.32 q1, [r1] -; CHECK-NEXT: vmov.f32 s30, s2 +; CHECK-NEXT: vstrw.32 q6, [r1] +; CHECK-NEXT: vmov.f32 s13, s17 ; CHECK-NEXT: vmov.f32 s14, s18 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload -; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s10 +; CHECK-NEXT: vstrw.32 q3, [r1, #48] +; CHECK-NEXT: vmov.f32 s9, s17 +; CHECK-NEXT: vmov.f32 s30, s2 ; CHECK-NEXT: vstrw.32 q7, [r1, #16] +; CHECK-NEXT: vmov.f32 s10, s18 +; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vmov.f32 s18, s22 +; CHECK-NEXT: vstrw.32 q2, [r1, #32] ; CHECK-NEXT: vmov.f32 s21, s17 -; CHECK-NEXT: vstrw.32 q3, [r1, #32] ; CHECK-NEXT: vmov.f32 s22, s18 ; CHECK-NEXT: vstrw.32 q5, [r1, #64] ; CHECK-NEXT: add sp, #128 -- GitLab From dc51cc3293c3740b85f22f301a326573132db4ee Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 21 Mar 2021 12:08:53 +0000 Subject: [PATCH 0283/1000] [X86] Add 'mulhs' variant of PR49658 test case --- llvm/test/CodeGen/X86/combine-pmuldq.ll | 176 +++++++++++++++++++++++- 1 file changed, 171 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 63e3c48e3520..74b953a839f8 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -288,8 +288,8 @@ entry: } declare dso_local i32 @foo(i32, i32, i32, i32) -define <8 x i32> @PR49658(i32* %ptr, i32 %mul) { -; SSE-LABEL: PR49658: +define <8 x i32> @PR49658_zext(i32* %ptr, i32 %mul) { +; SSE-LABEL: PR49658_zext: ; SSE: # %bb.0: # %start ; SSE-NEXT: movl %esi, %eax ; SSE-NEXT: movq %rax, %xmm0 @@ -317,7 +317,7 @@ define <8 x i32> @PR49658(i32* %ptr, i32 %mul) { ; SSE-NEXT: # %bb.2: # %end ; SSE-NEXT: retq ; -; AVX2-LABEL: PR49658: +; AVX2-LABEL: PR49658_zext: ; AVX2: # %bb.0: # %start ; AVX2-NEXT: movl %esi, %eax ; AVX2-NEXT: vmovq %rax, %xmm0 @@ -340,7 +340,7 @@ define <8 x i32> @PR49658(i32* %ptr, i32 
%mul) { ; AVX2-NEXT: # %bb.2: # %end ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: PR49658: +; AVX512VL-LABEL: PR49658_zext: ; AVX512VL: # %bb.0: # %start ; AVX512VL-NEXT: movl %esi, %eax ; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 @@ -359,7 +359,7 @@ define <8 x i32> @PR49658(i32* %ptr, i32 %mul) { ; AVX512VL-NEXT: # %bb.2: # %end ; AVX512VL-NEXT: retq ; -; AVX512DQVL-LABEL: PR49658: +; AVX512DQVL-LABEL: PR49658_zext: ; AVX512DQVL: # %bb.0: # %start ; AVX512DQVL-NEXT: movl %esi, %eax ; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1 @@ -399,3 +399,169 @@ loop: end: ret <8 x i32> %nextsum } + +define <8 x i32> @PR49658_sext(i32* %ptr, i32 %mul) { +; SSE-LABEL: PR49658_sext: +; SSE: # %bb.0: # %start +; SSE-NEXT: movslq %esi, %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,1,0,1] +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; SSE-NEXT: movdqa %xmm9, %xmm8 +; SSE-NEXT: psrlq $32, %xmm8 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: .p2align 4, 0x90 +; SSE-NEXT: .LBB8_1: # %loop +; SSE-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE-NEXT: pmovsxdq 2097176(%rdi,%rax), %xmm5 +; SSE-NEXT: pmovsxdq 2097168(%rdi,%rax), %xmm4 +; SSE-NEXT: pmovsxdq 2097152(%rdi,%rax), %xmm6 +; SSE-NEXT: pmovsxdq 2097160(%rdi,%rax), %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pmuludq %xmm7, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pmuludq %xmm7, %xmm2 +; SSE-NEXT: psrlq $32, %xmm7 +; SSE-NEXT: pmuludq %xmm9, %xmm7 +; SSE-NEXT: paddq %xmm3, %xmm7 +; SSE-NEXT: psllq $32, %xmm7 +; SSE-NEXT: paddq %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pmuludq %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: pmuludq %xmm6, %xmm3 +; SSE-NEXT: psrlq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm9, %xmm6 +; SSE-NEXT: paddq %xmm2, %xmm6 +; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: paddq %xmm3, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,3],xmm7[1,3] +; SSE-NEXT: paddd %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm9, %xmm4 +; SSE-NEXT: paddq %xmm3, %xmm4 +; SSE-NEXT: movdqa %xmm5, %xmm2 +; SSE-NEXT: psrlq $32, %xmm2 +; SSE-NEXT: pmuludq %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pmuludq %xmm5, %xmm3 +; SSE-NEXT: paddq %xmm2, %xmm3 +; SSE-NEXT: psllq $32, %xmm3 +; SSE-NEXT: pmuludq %xmm9, %xmm5 +; SSE-NEXT: paddq %xmm3, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm5[1,3] +; SSE-NEXT: paddd %xmm4, %xmm1 +; SSE-NEXT: subq $-128, %rax +; SSE-NEXT: jne .LBB8_1 +; SSE-NEXT: # %bb.2: # %end +; SSE-NEXT: retq +; +; AVX2-LABEL: PR49658_sext: +; AVX2: # %bb.0: # %start +; AVX2-NEXT: movslq %esi, %rax +; AVX2-NEXT: vmovq %rax, %xmm0 +; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1 +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm2 +; AVX2-NEXT: .p2align 4, 0x90 +; AVX2-NEXT: .LBB8_1: # %loop +; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm3 +; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm4 +; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5 +; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm6 +; AVX2-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 +; AVX2-NEXT: vpaddq %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 +; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm4 +; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpmuludq 
%ymm3, %ymm2, %ymm5 +; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm6 +; AVX2-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 +; AVX2-NEXT: vpaddq %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 +; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpaddq %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7] +; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: subq $-128, %rax +; AVX2-NEXT: jne .LBB8_1 +; AVX2-NEXT: # %bb.2: # %end +; AVX2-NEXT: retq +; +; AVX512VL-LABEL: PR49658_sext: +; AVX512VL: # %bb.0: # %start +; AVX512VL-NEXT: movslq %esi, %rax +; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm2 +; AVX512VL-NEXT: .p2align 4, 0x90 +; AVX512VL-NEXT: .LBB8_1: # %loop +; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512VL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm3 +; AVX512VL-NEXT: vpmuludq %zmm3, %zmm2, %zmm4 +; AVX512VL-NEXT: vpsrlq $32, %zmm3, %zmm5 +; AVX512VL-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 +; AVX512VL-NEXT: vpaddq %zmm4, %zmm5, %zmm4 +; AVX512VL-NEXT: vpsllq $32, %zmm4, %zmm4 +; AVX512VL-NEXT: vpmuludq %zmm3, %zmm1, %zmm3 +; AVX512VL-NEXT: vpaddq %zmm4, %zmm3, %zmm3 +; AVX512VL-NEXT: vpsrlq $32, %zmm3, %zmm3 +; AVX512VL-NEXT: vpmovqd %zmm3, %ymm3 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: subq $-128, %rax +; AVX512VL-NEXT: jne .LBB8_1 +; AVX512VL-NEXT: # %bb.2: # %end +; AVX512VL-NEXT: retq +; +; AVX512DQVL-LABEL: PR49658_sext: +; AVX512DQVL: # %bb.0: # %start +; AVX512DQVL-NEXT: movslq %esi, %rax +; AVX512DQVL-NEXT: vpbroadcastq %rax, %zmm1 +; AVX512DQVL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512DQVL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 +; AVX512DQVL-NEXT: .p2align 4, 0x90 +; AVX512DQVL-NEXT: .LBB8_1: # %loop +; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX512DQVL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm2 +; AVX512DQVL-NEXT: vpmullq %zmm2, %zmm1, %zmm2 +; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 +; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; AVX512DQVL-NEXT: subq $-128, %rax +; AVX512DQVL-NEXT: jne .LBB8_1 +; AVX512DQVL-NEXT: # %bb.2: # %end +; AVX512DQVL-NEXT: retq +start: + %t1 = sext i32 %mul to i64 + %t2 = insertelement <8 x i64> undef, i64 %t1, i32 0 + %mulvec = shufflevector <8 x i64> %t2, <8 x i64> undef, <8 x i32> zeroinitializer + br label %loop +loop: + %loopcnt = phi i64 [ 0, %start ], [ %nextcnt, %loop ] + %sum = phi <8 x i32> [ zeroinitializer, %start ], [ %nextsum, %loop ] + %ptroff = getelementptr inbounds i32, i32* %ptr, i64 %loopcnt + %vptroff = bitcast i32* %ptroff to <8 x i32>* + %v = load <8 x i32>, <8 x i32>* %vptroff, align 4 + %v64 = sext <8 x i32> %v to <8 x i64> + %vmul = mul <8 x i64> %mulvec, %v64 + %vmulhi = ashr <8 x i64> %vmul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32> + %vtrunc = trunc <8 x i64> %vmulhi to <8 x i32> + %nextsum = add <8 x i32> %vtrunc, %sum + %nextcnt = add i64 %loopcnt, 32 + %isdone = icmp eq i64 %nextcnt, 524288 + br i1 %isdone, label %end, label %loop +end: + ret <8 x i32> %nextsum +} -- GitLab From 3179588947fef91d082e022347d856ec1d18b6ad Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Sun, 21 Mar 2021 12:22:51 +0000 Subject: [PATCH 0284/1000] [X86][AVX] ComputeNumSignBitsForTargetNode - add X86ISD::VBROADCAST handling for scalar sources The target shuffle code handles vector sources, but
X86ISD::VBROADCAST can also accept a scalar source for splatting. Added as an extension to PR49658 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 ++++ llvm/test/CodeGen/X86/combine-pmuldq.ll | 50 ++++++++----------------- 2 files changed, 22 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index c6af291f24d9..76b4aaa11190 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -34545,6 +34545,13 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( return 1; } + case X86ISD::VBROADCAST: { + SDValue Src = Op.getOperand(0); + if (!Src.getSimpleValueType().isVector()) + return DAG.ComputeNumSignBits(Src, Depth + 1); + break; + } + case X86ISD::VSHLI: { SDValue Src = Op.getOperand(0); const APInt &ShiftVal = Op.getConstantOperandAPInt(1); diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll index 74b953a839f8..4545a084aaaf 100644 --- a/llvm/test/CodeGen/X86/combine-pmuldq.ll +++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll @@ -470,30 +470,17 @@ define <8 x i32> @PR49658_sext(i32* %ptr, i32 %mul) { ; AVX2-NEXT: vpbroadcastq %xmm0, %ymm1 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm2 ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB8_1: # %loop ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm3 -; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm4 -; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm4, %ymm6 -; AVX2-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 -; AVX2-NEXT: vpaddq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 -; AVX2-NEXT: vpmuludq %ymm4, %ymm1, %ymm4 -; AVX2-NEXT: vpaddq %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm5 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm6 -; AVX2-NEXT: vpmuludq %ymm6, %ymm1, %ymm6 -; AVX2-NEXT: vpaddq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vpsllq $32, %ymm5, %ymm5 -; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm3 -; AVX2-NEXT: vpaddq %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm4[2,3],ymm3[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3 -; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,3],ymm5[1,3],ymm3[5,7],ymm5[5,7] -; AVX2-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpmovsxdq 2097168(%rdi,%rax), %ymm2 +; AVX2-NEXT: vpmovsxdq 2097152(%rdi,%rax), %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm1, %ymm3 +; AVX2-NEXT: vpmuldq %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm3[2,3],ymm2[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,3],ymm4[1,3],ymm2[5,7],ymm4[5,7] +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: subq $-128, %rax ; AVX2-NEXT: jne .LBB8_1 ; AVX2-NEXT: # %bb.2: # %end @@ -505,21 +492,14 @@ define <8 x i32> @PR49658_sext(i32* %ptr, i32 %mul) { ; AVX512VL-NEXT: vpbroadcastq %rax, %zmm1 ; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: movq $-2097152, %rax # imm = 0xFFE00000 -; AVX512VL-NEXT: vpsrlq $32, %zmm1, %zmm2 ; AVX512VL-NEXT: .p2align 4, 0x90 ; AVX512VL-NEXT: .LBB8_1: # %loop ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm3 -; AVX512VL-NEXT: vpmuludq %zmm3, %zmm2, %zmm4 -; AVX512VL-NEXT: vpsrlq $32, %zmm3, %zmm5 -; AVX512VL-NEXT: vpmuludq %zmm5, %zmm1, %zmm5 -; AVX512VL-NEXT: vpaddq %zmm4, %zmm5, %zmm4 -; AVX512VL-NEXT: vpsllq $32, %zmm4, %zmm4 -; AVX512VL-NEXT: vpmuludq 
%zmm3, %zmm1, %zmm3 -; AVX512VL-NEXT: vpaddq %zmm4, %zmm3, %zmm3 -; AVX512VL-NEXT: vpsrlq $32, %zmm3, %zmm3 -; AVX512VL-NEXT: vpmovqd %zmm3, %ymm3 -; AVX512VL-NEXT: vpaddd %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512VL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 +; AVX512VL-NEXT: vpsrlq $32, %zmm2, %zmm2 +; AVX512VL-NEXT: vpmovqd %zmm2, %ymm2 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; AVX512VL-NEXT: subq $-128, %rax ; AVX512VL-NEXT: jne .LBB8_1 ; AVX512VL-NEXT: # %bb.2: # %end @@ -534,8 +514,8 @@ define <8 x i32> @PR49658_sext(i32* %ptr, i32 %mul) { ; AVX512DQVL-NEXT: .p2align 4, 0x90 ; AVX512DQVL-NEXT: .LBB8_1: # %loop ; AVX512DQVL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512DQVL-NEXT: vpmovsxdq 2097152(%rdi,%rax), %zmm2 -; AVX512DQVL-NEXT: vpmullq %zmm2, %zmm1, %zmm2 +; AVX512DQVL-NEXT: vpmovzxdq {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX512DQVL-NEXT: vpmuldq %zmm2, %zmm1, %zmm2 ; AVX512DQVL-NEXT: vpsrlq $32, %zmm2, %zmm2 ; AVX512DQVL-NEXT: vpmovqd %zmm2, %ymm2 ; AVX512DQVL-NEXT: vpaddd %ymm0, %ymm2, %ymm0 -- GitLab From 8757616de38112a875e7e2ad38d851243ccb5d6b Mon Sep 17 00:00:00 2001 From: Jez Ng Date: Sun, 21 Mar 2021 01:10:04 -0400 Subject: [PATCH 0285/1000] [lld-macho][nfc] Format Options.td Summary: A good chunk of it was mis-indented. Fixed by using the formatting settings from llvm/utils/vim. --- lld/MachO/Options.td | 1850 +++++++++++++++++++++--------------------- 1 file changed, 925 insertions(+), 925 deletions(-) diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td index af8e44e73724..0e9f7b8f7390 100644 --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -37,7 +37,7 @@ def no_lto_legacy_pass_manager : Flag<["--"], "no-lto-legacy-pass-manager">, Group<grp_lld>; def time_trace: Flag<["--"], "time-trace">, HelpText<"Record time trace">; def time_trace_granularity: Flag<["--"], "time-trace-granularity">, - HelpText<"Minimum time granularity (in microseconds) traced by time profiler">; + HelpText<"Minimum time granularity (in microseconds) traced by time profiler">; def time_trace_file_eq: Flag<["--"], "time-trace-file=">, HelpText<"Specify time trace output file">; // This is a complete Options.td compiled from Apple's ld(1) manpage @@ -53,1264 +53,1264 @@ def time_trace_file_eq: Flag<["--"], "time-trace-file=">, HelpText<"Specify time def grp_kind : OptionGroup<"kind">, HelpText<"OUTPUT KIND">; def execute : Flag<["-"], "execute">, - HelpText<"Produce a main executable (default)">, - Group<grp_kind>; + HelpText<"Produce a main executable (default)">, + Group<grp_kind>; def dylib : Flag<["-"], "dylib">, - HelpText<"Produce a shared library">, - Group<grp_kind>; + HelpText<"Produce a shared library">, + Group<grp_kind>; def bundle : Flag<["-"], "bundle">, - HelpText<"Produce a bundle">, - Group<grp_kind>; + HelpText<"Produce a bundle">, + Group<grp_kind>; def r : Flag<["-"], "r">, - HelpText<"Merge multiple object files into one, retaining relocations">, - Flags<[HelpHidden]>, - Group<grp_kind>; + HelpText<"Merge multiple object files into one, retaining relocations">, + Flags<[HelpHidden]>, + Group<grp_kind>; def dylinker : Flag<["-"], "dylinker">, - HelpText<"Produce a dylinker only used when building dyld">, - Flags<[HelpHidden]>, - Group<grp_kind>; + HelpText<"Produce a dylinker only used when building dyld">, + Flags<[HelpHidden]>, + Group<grp_kind>; def dynamic : Flag<["-"], "dynamic">, - HelpText<"Link dynamically (default)">, - Group<grp_kind>; + HelpText<"Link dynamically
(default)">, + Group; def static : Flag<["-"], "static">, - HelpText<"Link statically">, - Flags<[HelpHidden]>, - Group; + HelpText<"Link statically">, + Flags<[HelpHidden]>, + Group; def preload : Flag<["-"], "preload">, - HelpText<"Produce an unsegmented binary for embedded systems">, - Flags<[HelpHidden]>, - Group; + HelpText<"Produce an unsegmented binary for embedded systems">, + Flags<[HelpHidden]>, + Group; def arch : Separate<["-"], "arch">, - MetaVarName<"">, - HelpText<"The architecture (e.g. ppc, ppc64, i386, x86_64)">, - Group; + MetaVarName<"">, + HelpText<"The architecture (e.g. ppc, ppc64, i386, x86_64)">, + Group; def o : Separate<["-"], "o">, - MetaVarName<"">, - HelpText<"The name of the output file (default: `a.out')">, - Group; + MetaVarName<"">, + HelpText<"The name of the output file (default: `a.out')">, + Group; def grp_libs : OptionGroup<"libs">, HelpText<"LIBRARIES">; def l : Joined<["-"], "l">, - MetaVarName<"">, - HelpText<"Search for lib.dylib or lib.a on the library search path">, - Group; + MetaVarName<"">, + HelpText<"Search for lib.dylib or lib.a on the library search path">, + Group; def weak_l : Joined<["-"], "weak-l">, - MetaVarName<"">, - HelpText<"Like -l, but mark library and its references as weak imports">, - Group; + MetaVarName<"">, + HelpText<"Like -l, but mark library and its references as weak imports">, + Group; def weak_library : Separate<["-"], "weak_library">, - MetaVarName<"">, - HelpText<"Like bare , but mark library and its references as weak imports">, - Group; + MetaVarName<"">, + HelpText<"Like bare , but mark library and its references as weak imports">, + Group; def reexport_l : Joined<["-"], "reexport-l">, - MetaVarName<"">, - HelpText<"Like -l, but export all symbols of from newly created library">, - Flags<[HelpHidden]>, - Group; + MetaVarName<"">, + HelpText<"Like -l, but export all symbols of from newly created library">, + Flags<[HelpHidden]>, + Group; def reexport_library : Separate<["-"], "reexport_library">, - MetaVarName<"">, - HelpText<"Like bare , but export all symbols of from newly created library">, - Flags<[HelpHidden]>, - Group; + MetaVarName<"">, + HelpText<"Like bare , but export all symbols of from newly created library">, + Flags<[HelpHidden]>, + Group; def upward_l : Joined<["-"], "upward-l">, - MetaVarName<"">, - HelpText<"Like -l, but specify dylib as an upward dependency">, - Flags<[HelpHidden]>, - Group; + MetaVarName<"">, + HelpText<"Like -l, but specify dylib as an upward dependency">, + Flags<[HelpHidden]>, + Group; def upward_library : Separate<["-"], "upward_library">, - MetaVarName<"">, - HelpText<"Like bare , but specify dylib as an upward dependency">, - Flags<[HelpHidden]>, - Group; + MetaVarName<"">, + HelpText<"Like bare , but specify dylib as an upward dependency">, + Flags<[HelpHidden]>, + Group; def L : JoinedOrSeparate<["-"], "L">, - MetaVarName<"">, - HelpText<"Add dir to the library search path">, - Group; + MetaVarName<"">, + HelpText<"Add dir to the library search path">, + Group; def Z : Flag<["-"], "Z">, - HelpText<"Remove standard directories from the library and framework search paths">, - Group; + HelpText<"Remove standard directories from the library and framework search paths">, + Group; def syslibroot : Separate<["-"], "syslibroot">, - MetaVarName<"">, - HelpText<"Prepend to all library and framework search paths">, - Group; + MetaVarName<"">, + HelpText<"Prepend to all library and framework search paths">, + Group; def search_paths_first : Flag<["-"], "search_paths_first">, - 
HelpText<"Search for lib.dylib and lib.a at each step in traversing search path (default for Xcode 4 and later)">, - Group; + HelpText<"Search for lib.dylib and lib.a at each step in traversing search path (default for Xcode 4 and later)">, + Group; def search_dylibs_first : Flag<["-"], "search_dylibs_first">, - HelpText<"Search for lib.dylib on first pass, then for lib.a on second pass through search path (default for Xcode 3 and earlier)">, - Group; + HelpText<"Search for lib.dylib on first pass, then for lib.a on second pass through search path (default for Xcode 3 and earlier)">, + Group; def framework : Separate<["-"], "framework">, - MetaVarName<"">, - HelpText<"Search for .framework/ on the framework search path">, - Group; + MetaVarName<"">, + HelpText<"Search for .framework/ on the framework search path">, + Group; def weak_framework : Separate<["-"], "weak_framework">, - MetaVarName<"">, - HelpText<"Like -framework , but mark framework and its references as weak imports">, - Group; + MetaVarName<"">, + HelpText<"Like -framework , but mark framework and its references as weak imports">, + Group; def reexport_framework : Separate<["-"], "reexport_framework">, - MetaVarName<"">, - HelpText<"Like -framework , but export all symbols of from the newly created library">, - Flags<[HelpHidden]>, - Group; + MetaVarName<"">, + HelpText<"Like -framework , but export all symbols of from the newly created library">, + Flags<[HelpHidden]>, + Group; def upward_framework : Separate<["-"], "upward_framework">, - MetaVarName<"">, - HelpText<"Like -framework , but specify the framework as an upward dependency">, - Flags<[HelpHidden]>, - Group; + MetaVarName<"">, + HelpText<"Like -framework , but specify the framework as an upward dependency">, + Flags<[HelpHidden]>, + Group; def F : JoinedOrSeparate<["-"], "F">, - MetaVarName<"">, - HelpText<"Add dir to the framework search path">, - Group; + MetaVarName<"">, + HelpText<"Add dir to the framework search path">, + Group; def all_load : Flag<["-"], "all_load">, - HelpText<"Load all members of all static archive libraries">, - Group; + HelpText<"Load all members of all static archive libraries">, + Group; def ObjC : Flag<["-"], "ObjC">, - HelpText<"Load all members of static archives that are an Objective-C class or category.">, - Group; + HelpText<"Load all members of static archives that are an Objective-C class or category.">, + Group; def force_load : Separate<["-"], "force_load">, - MetaVarName<"">, - HelpText<"Load all members static archive library at ">, - Group; + MetaVarName<"">, + HelpText<"Load all members static archive library at ">, + Group; def grp_content : OptionGroup<"content">, HelpText<"ADDITIONAL CONTENT">; def sectcreate : MultiArg<["-"], "sectcreate", 3>, - MetaVarName<"
">, - HelpText<"Create
in from the contents of ">, - Group; + MetaVarName<"
">, + HelpText<"Create
in from the contents of ">, + Group; def segcreate : MultiArg<["-"], "segcreate", 3>, - MetaVarName<"
">, - Alias, - HelpText<"Alias for -sectcreate">, - Flags<[HelpHidden]>, - Group; + MetaVarName<"
">, + Alias, + HelpText<"Alias for -sectcreate">, + Flags<[HelpHidden]>, + Group; def filelist : Separate<["-"], "filelist">, - MetaVarName<"">, - HelpText<"Read names of files to link from ">, - Group; + MetaVarName<"">, + HelpText<"Read names of files to link from ">, + Group; def dtrace : Separate<["-"], "dtrace">, - MetaVarName<"