From 1542d5a00ae870cd6996496fac29f8818b042efd Mon Sep 17 00:00:00 2001 From: Nadav Rotem Date: Thu, 11 Aug 2011 16:41:21 +0000 Subject: [PATCH] [AVX] If the data which is going to be saved is already in two XMM registers (for example, after an integer operation), do not pack the registers into a YMM before saving. It's better to save as two XMM registers. Before: vinsertf128 $1, %xmm3, %ymm0, %ymm3 vinsertf128 $0, %xmm1, %ymm3, %ymm1 vmovaps %ymm1, 416(%rsp) After: vmovaps %xmm3, 416+16(%rsp) vmovaps %xmm1, 416(%rsp) llvm-svn: 137308 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++++++++++++++++++++++ llvm/test/CodeGen/X86/avx-insert.ll | 17 +++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 llvm/test/CodeGen/X86/avx-insert.ll diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index be3ecd7e531b..94faae5b556b 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12589,6 +12589,29 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, EVT VT = St->getValue().getValueType(); EVT StVT = St->getMemoryVT(); DebugLoc dl = St->getDebugLoc(); + SDValue StoredVal = St->getOperand(1); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + // If we are saving a concatenation of two XMM registers, perform two stores. 
+ if (VT.getSizeInBits() == 256 && + StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS && + StoredVal.getNumOperands() == 2) { + + SDValue Value0 = StoredVal.getOperand(0); + SDValue Value1 = StoredVal.getOperand(1); + + SDValue Stride = DAG.getConstant(16, TLI.getPointerTy()); + SDValue Ptr0 = St->getBasePtr(); + SDValue Ptr1 = DAG.getNode(ISD::ADD, dl, Ptr0.getValueType(), Ptr0, Stride); + + SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, + St->getPointerInfo(), St->isVolatile(), + St->isNonTemporal(), St->getAlignment()); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); + } // Optimize trunc store (of multiple scalars) to shuffle and store. // First, pack all of the elements in one place. Next, store to memory diff --git a/llvm/test/CodeGen/X86/avx-insert.ll b/llvm/test/CodeGen/X86/avx-insert.ll new file mode 100644 index 000000000000..d9eae03eee75 --- /dev/null +++ b/llvm/test/CodeGen/X86/avx-insert.ll @@ -0,0 +1,17 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s + +; It is faster to make two saves, if the data is already in XMM registers. For +; example, after making an integer operation. +define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp { +entry: + ; CHECK: movaps + ; CHECK: movaps + ; CHECK: movaps + ; CHECK: movaps + %A = load <4 x i32>* %Ap + %B = load <4 x i32>* %Bp + %Z = shufflevector <4 x i32>%A, <4 x i32>%B, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + store <8 x i32> %Z, <8 x i32>* %P, align 16 + ret void +} + -- GitLab