From 7189084bef9b3d17039afbfd582dbc93fe14af55 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 5 Mar 2015 17:14:04 +0000
Subject: [PATCH] [DagCombiner] Allow shuffles to merge through bitcasts

Currently shuffles may only be combined if they are of the same type,
despite the fact that bitcasts are often introduced in between shuffle
nodes (e.g. x86 shuffle type widening).

This patch allows a single-input shuffle to peek through bitcasts; if the
input is another shuffle, the two shuffles are merged, shuffling at the
smallest scalar type, and the bitcasts are re-applied at the inputs and
output instead.

Dropped the old ShuffleToZext test: this patch removes the use of the
zext, and vector-zext.ll covers these cases anyhow.

Differential Revision: http://reviews.llvm.org/D7939

llvm-svn: 231380
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 83 +++++++++++++++++++
 .../CodeGen/X86/2013-02-12-ShuffleToZext.ll   | 14 ----
 .../CodeGen/X86/vector-shuffle-128-v16.ll     | 19 +++++
 .../test/CodeGen/X86/vector-shuffle-128-v2.ll | 19 +++++
 .../test/CodeGen/X86/vector-shuffle-128-v4.ll | 17 ++++
 .../test/CodeGen/X86/vector-shuffle-256-v4.ll | 19 +++++
 llvm/test/CodeGen/X86/vector-shuffle-mmx.ll   | 11 ++-
 7 files changed, 162 insertions(+), 20 deletions(-)
 delete mode 100644 llvm/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4a074a03d9a7..4cf34346184d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11877,6 +11877,89 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
     return V;
   }
 
+  // If this shuffle only has a single input that is a bitcasted shuffle,
+  // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
+  // back to their original types.
+  if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
+      N1.getOpcode() == ISD::UNDEF && Level < AfterLegalizeVectorOps &&
+      TLI.isTypeLegal(VT)) {
+
+    // Peek through the bitcast only if there is one user.
+    SDValue BC0 = N0;
+    while (BC0.getOpcode() == ISD::BITCAST) {
+      if (!BC0.hasOneUse())
+        break;
+      BC0 = BC0.getOperand(0);
+    }
+
+    auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
+      if (Scale == 1)
+        return SmallVector<int, 8>(Mask.begin(), Mask.end());
+
+      SmallVector<int, 8> NewMask;
+      for (int M : Mask)
+        for (int s = 0; s != Scale; ++s)
+          NewMask.push_back(M < 0 ? -1 : Scale * M + s);
+      return NewMask;
+    };
+
+    if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
+      EVT SVT = VT.getScalarType();
+      EVT InnerVT = BC0->getValueType(0);
+      EVT InnerSVT = InnerVT.getScalarType();
+
+      // Determine which shuffle works with the smaller scalar type.
+      EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
+      EVT ScaleSVT = ScaleVT.getScalarType();
+
+      if (TLI.isTypeLegal(ScaleVT) &&
+          0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
+          0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
+
+        int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+        int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+
+        // Scale the shuffle masks to the smaller scalar type.
+        ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
+        SmallVector<int, 8> InnerMask =
+            ScaleShuffleMask(InnerSVN->getMask(), InnerScale);
+        SmallVector<int, 8> OuterMask =
+            ScaleShuffleMask(SVN->getMask(), OuterScale);
+
+        // Merge the shuffle masks.
+        SmallVector<int, 8> NewMask;
+        for (int M : OuterMask)
+          NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
+
+        // Test for shuffle mask legality over both commutations.
+        SDValue SV0 = BC0->getOperand(0);
+        SDValue SV1 = BC0->getOperand(1);
+        bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+        if (!LegalMask) {
+          for (int i = 0, e = (int)NewMask.size(); i != e; ++i) {
+            int idx = NewMask[i];
+            if (idx < 0)
+              continue;
+            else if (idx < e)
+              NewMask[i] = idx + e;
+            else
+              NewMask[i] = idx - e;
+          }
+          std::swap(SV0, SV1);
+          LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+        }
+
+        if (LegalMask) {
+          SV0 = DAG.getNode(ISD::BITCAST, SDLoc(N), ScaleVT, SV0);
+          SV1 = DAG.getNode(ISD::BITCAST, SDLoc(N), ScaleVT, SV1);
+          return DAG.getNode(
+              ISD::BITCAST, SDLoc(N), VT,
+              DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
+        }
+      }
+    }
+  }
+
   // Canonicalize shuffles according to rules:
   // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
   // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
diff --git a/llvm/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll b/llvm/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll
deleted file mode 100644
index 614ccda5e250..000000000000
--- a/llvm/test/CodeGen/X86/2013-02-12-ShuffleToZext.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx -mtriple=x86_64-pc-win32 | FileCheck %s
-
-; CHECK: test
-; CHECK: vpmovzxwd
-; CHECK: vpmovzxwd
-define void @test(<4 x i64> %a, <4 x i16>* %buf) {
-  %ex1 = extractelement <4 x i64> %a, i32 0
-  %ex2 = extractelement <4 x i64> %a, i32 1
-  %x1 = bitcast i64 %ex1 to <4 x i16>
-  %x2 = bitcast i64 %ex2 to <4 x i16>
-  %Sh = shufflevector <4 x i16> %x1, <4 x i16> %x2, <4 x i32>
-  store <4 x i16> %Sh, <4 x i16>* %buf, align 1
-  ret void
-}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
index c271622e8a6e..01b8972e13de 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -1336,3 +1336,22 @@ define <16 x i8> @shuffle_v16i8_uu_02_03_zz_uu_06_07_zz_uu_10_11_zz_uu_14_15_zz(
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> zeroinitializer, <16 x i32> <i32 undef, i32 2, i32 3, i32 16, i32 undef, i32 6, i32 7, i32 16, i32 undef, i32 10, i32 11, i32 16, i32 undef, i32 14, i32 15, i32 16>
   ret <16 x i8> %shuffle
 }
+
+define <16 x i8> @shuffle_v16i8_bitcast_unpack(<16 x i8> %a, <16 x i8> %b) {
+; SSE-LABEL: shuffle_v16i8_bitcast_unpack:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_bitcast_unpack:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    retq
+  %shuffle8  = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 7, i32 23, i32 6, i32 22, i32 5, i32 21, i32 4, i32 20, i32 3, i32 19, i32 2, i32 18, i32 1, i32 17, i32 0, i32 16>
+  %bitcast32 = bitcast <16 x i8> %shuffle8 to <4 x float>
+  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %bitcast16 = bitcast <4 x float> %shuffle32 to <8 x i16>
+  %shuffle16 = shufflevector <8 x i16> %bitcast16, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %bitcast8  = bitcast <8 x i16> %shuffle16 to <16 x i8>
+  ret <16 x i8> %bitcast8
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index 30eceacb734f..ee68df581bfd 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -810,6 +810,25 @@ define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
   ret <2 x double> %shuffle
 }
 
+define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
+; SSE-LABEL: shuffle_v2f64_bitcast_1z:
+; SSE:       # BB#0:
+; SSE-NEXT:    xorpd %xmm1, %xmm1
+; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v2f64_bitcast_1z:
+; AVX:       # BB#0:
+; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; AVX-NEXT:    retq
+  %shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
+  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x float>
+  %shuffle32 = shufflevector <4 x float> %bitcast32, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
+  %bitcast64 = bitcast <4 x float> %shuffle32 to <2 x double>
+  ret <2 x double> %bitcast64
+}
+
 define <2 x i64> @insert_reg_and_zero_v2i64(i64 %a) {
 ; SSE-LABEL: insert_reg_and_zero_v2i64:
 ; SSE:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index 2021905a2238..8612a5afa3d2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1574,6 +1574,23 @@ define <4 x i32> @shuffle_v4i32_0zz3(<4 x i32> %a) {
   ret <4 x i32> %shuffle
 }
 
+define <4 x i32> @shuffle_v4i32_bitcast_0415(<4 x i32> %a, <4 x i32> %b) {
+; SSE-LABEL: shuffle_v4i32_bitcast_0415:
+; SSE:       # BB#0:
+; SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v4i32_bitcast_0415:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX-NEXT:    retq
+  %shuffle32 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 5, i32 0, i32 4>
+  %bitcast64 = bitcast <4 x i32> %shuffle32 to <2 x double>
+  %shuffle64 = shufflevector <2 x double> %bitcast64, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+  %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x i32>
+  ret <4 x i32> %bitcast32
+}
+
 define <4 x i32> @insert_reg_and_zero_v4i32(i32 %a) {
 ; SSE-LABEL: insert_reg_and_zero_v4i32:
 ; SSE:       # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 0ac9a2b62eb9..8aca67c0bdb3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -922,3 +922,22 @@ define <4 x double> @splat_v4f64(<2 x double> %r) {
   %1 = shufflevector <2 x double> %r, <2 x double> undef, <4 x i32> zeroinitializer
   ret <4 x double> %1
 }
+
+define <4 x double> @bitcast_v4f64_0426(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: bitcast_v4f64_0426:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: bitcast_v4f64_0426:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT:    retq
+  %shuffle64 = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  %bitcast32 = bitcast <4 x double> %shuffle64 to <8 x float>
+  %shuffle32 = shufflevector <8 x float> %bitcast32, <8 x float> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
+  %bitcast16 = bitcast <8 x float> %shuffle32 to <16 x i16>
+  %shuffle16 = shufflevector <16 x i16> %bitcast16, <16 x i16> undef, <16 x i32> <i32 2, i32 3, i32 0, i32 1, i32 6, i32 7, i32 4, i32 5, i32 10, i32 11, i32 8, i32 9, i32 14, i32 15, i32 12, i32 13>
+  %bitcast64 = bitcast <16 x i16> %shuffle16 to <4 x double>
+  ret <4 x double> %bitcast64
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
index c585414d92d1..094722d26808 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-mmx.ll
@@ -8,14 +8,14 @@ define void @test0(<1 x i64>* %x) {
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X32-NEXT:    movlpd %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test0:
 ; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X64-NEXT:    movq %xmm0, (%rdi)
 ; X64-NEXT:    retq
 entry:
@@ -84,16 +84,15 @@ define void @test2() nounwind {
 ; X32:       ## BB#0: ## %entry
 ; X32-NEXT:    movl L_tmp_V2i$non_lazy_ptr, %eax
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0,0]
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-NEXT:    movlpd %xmm0, (%eax)
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; X32-NEXT:    movlps %xmm0, (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test2:
 ; X64:       ## BB#0: ## %entry
 ; X64-NEXT:    movq _tmp_V2i@{{.*}}(%rip), %rax
 ; X64-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,1]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
 ; X64-NEXT:    movq %xmm0, (%rax)
 ; X64-NEXT:    retq
 entry:
-- 
GitLab
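
Illustrative note (not part of the commit): the new combine rewrites
shuffle(bitcast(shuffle(x, y))) as bitcast(shuffle(bitcast(x), bitcast(y))),
remapping both masks to the smaller scalar type. Below is a minimal LLVM IR
sketch of the pattern it targets; the function name and masks are
hypothetical, chosen only to make the mask arithmetic easy to follow.

  define <4 x float> @merge_through_bitcast(<2 x double> %a, <2 x double> %b) {
    ; Inner two-input v2f64 shuffle: result is [ %a[1], %b[0] ].
    %inner = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 2>
    ; A bitcast separates the two shuffles, as x86 shuffle widening often introduces.
    %cast = bitcast <2 x double> %inner to <4 x float>
    ; Outer shuffle has a single input (second operand undef), as the combine requires.
    %outer = shufflevector <4 x float> %cast, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
    ret <4 x float> %outer
  }

Scaling the inner v2f64 mask <1, 2> by 2 gives the v4f32 mask <2, 3, 4, 5>;
composing it with the outer mask <2, 3, 0, 1> yields <4, 5, 2, 3>, so, mask
legality permitting, the whole chain becomes a single v4f32 shuffle of the
bitcast inputs. The masks are always remapped to the smaller scalar type
because any wider-element mask can be expressed exactly at the narrower
granularity, while the converse does not generally hold.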