Newer
Older
Evan Cheng
committed
int64_t Offset = 0;
if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
FI = FINode->getIndex();
Offset = 0;
} else if (Ptr.getOpcode() == ISD::ADD &&
isa<ConstantSDNode>(Ptr.getOperand(1)) &&
isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
Offset = Ptr.getConstantOperandVal(1);
Ptr = Ptr.getOperand(0);
} else {
return SDValue();
}
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16.
MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
if (DAG.InferPtrAlignment(Ptr) < 16) {
if (MFI->isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
// If someone *really* cares about this. That's the way to implement it.
return SDValue();
Evan Cheng
committed
} else {
MFI->setObjectAlignment(FI, 16);
}
}
// (Offset % 16) must be multiple of 4. Then address is then
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
if ((Offset % 16) & 3)
return SDValue();
int64_t StartOffset = Offset & ~15;
if (StartOffset)
Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
int EltNo = (Offset - StartOffset) >> 2;
int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,
LD->getPointerInfo().getWithOffset(StartOffset),
Evan Cheng
committed
// Canonicalize it to a v4i32 shuffle.
Wesley Peck
committed
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
return DAG.getNode(ISD::BITCAST, dl, VT,
Evan Cheng
committed
DAG.getVectorShuffle(MVT::v4i32, dl, V1,
DAG.getUNDEF(MVT::v4i32),&Mask[0]));
Evan Cheng
committed
}
return SDValue();
}
/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
/// vector of type 'VT', see if the elements can be replaced by a single large
/// load which has the same value as a build_vector whose operands are 'elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
Nate Begeman
committed
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
DebugLoc &DL, SelectionDAG &DAG) {
Nate Begeman
committed
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
Nate Begeman
committed
LoadSDNode *LDBase = NULL;
unsigned LastLoadedElt = -1U;
// For each element in the initializer, see if we've found a load or an undef.
// If we don't find an initial load element, or later load elements are
Nate Begeman
committed
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Elts[i];
Nate Begeman
committed
if (!Elt.getNode() ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return SDValue();
if (!LDBase) {
if (Elt.getNode()->getOpcode() == ISD::UNDEF)
return SDValue();
LDBase = cast<LoadSDNode>(Elt.getNode());
LastLoadedElt = i;
continue;
}
if (Elt.getOpcode() == ISD::UNDEF)
continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt);
if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
return SDValue();
LastLoadedElt = i;
}
// If we have found an entire vector of loads and undefs, then return a large
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
Nate Begeman
committed
if (LastLoadedElt == NumElems - 1) {
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
Nate Begeman
committed
LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
Nate Begeman
committed
LDBase->isVolatile(), LDBase->isNonTemporal(),
LDBase->getAlignment());
} else if (NumElems == 4 && LastLoadedElt == 1) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
Ops, 2, MVT::i32,
LDBase->getMemOperand());
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
Nate Begeman
committed
}
return SDValue();
}
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
// All zero's are handled with pxor in SSE2 and above, xorps in SSE1.
// All one's are handled with pcmpeqd. In AVX, zero's are handled with
Bruno Cardoso Lopes
committed
// vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd
// is present, so AllOnes is ignored.
if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
(Op.getValueType().getSizeInBits() != 256 &&
ISD::isBuildVectorAllOnes(Op.getNode()))) {
// 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
// eliminated on x86-32 hosts.
Gabor Greif
committed
if (ISD::isBuildVectorAllOnes(Op.getNode()))
return getOnesVector(Op.getValueType(), DAG, dl);
return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
Owen Anderson
committed
EVT VT = Op.getValueType();
EVT ExtVT = VT.getVectorElementType();
unsigned EVTBits = ExtVT.getSizeInBits();
unsigned NumElems = Op.getNumOperands();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
bool IsAllConstants = true;
for (unsigned i = 0; i < NumElems; ++i) {
Evan Cheng
committed
if (Elt.getOpcode() == ISD::UNDEF)
continue;
Values.insert(Elt);
if (Elt.getOpcode() != ISD::Constant &&
Elt.getOpcode() != ISD::ConstantFP)
IsAllConstants = false;
Evan Cheng
committed
if (X86::isZeroNode(Elt))
Evan Cheng
committed
NumZero++;
else {
NonZeros |= (1 << i);
NumNonZero++;
}
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
unsigned Idx = CountTrailingZeros_32(NonZeros);
// If this is an insertion of an i64 value on x86-32, and if the top bits of
// the value are obviously zero, truncate the value to i32 and do the
// insertion that way. Only do this if the value is non-constant or if the
// value is a constant being inserted into element 0. It is cheaper to do
// a constant pool load than it is to do a movd + shuffle.
Owen Anderson
committed
if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
(!IsAllConstants || Idx == 0)) {
if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
EVT VecVT = MVT::v4i32;
unsigned VecElts = 4;
// Truncate the value (which may itself be a constant) to i32, and
// convert it to a vector with movd (S2V+shuffle to zero extend).
Owen Anderson
committed
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
Evan Cheng
committed
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
Subtarget->hasSSE2(), DAG);
// Now we have our 32-bit value zero extended in the low element of
// a vector. If Idx != 0, swizzle it into place.
if (Idx != 0) {
SmallVector<int, 4> Mask;
Mask.push_back(Idx);
for (unsigned i = 1; i != VecElts; ++i)
Mask.push_back(i);
Item = DAG.getVectorShuffle(VecVT, dl, Item,
&Mask[0]);
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
if (NumZero == 0) {
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
Owen Anderson
committed
} else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
DAG);
Owen Anderson
committed
} else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
EVT MiddleVT = MVT::v4i32;
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
Subtarget->hasSSE2(), DAG);
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
Evan Cheng
committed
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Evan Cheng
committed
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
Subtarget->hasSSE2(), DAG);
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(i == Idx ? 0 : 1);
return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
Evan Cheng
committed
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
unsigned Idx = CountTrailingZeros_32(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
}
Evan Cheng
committed
}
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
// Let legalizer expand 2-wide build_vectors.
Evan Cheng
committed
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = CountTrailingZeros_32(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Evan Cheng
committed
Op.getOperand(Idx));
Evan Cheng
committed
return getShuffleVectorZeroOrUndef(V2, Idx, true,
Subtarget->hasSSE2(), DAG);
Evan Cheng
committed
}
Evan Cheng
committed
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16) {
SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
Gabor Greif
committed
if (V.getNode()) return V;
if (EVTBits == 16 && NumElems == 8) {
SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
Gabor Greif
committed
if (V.getNode()) return V;
}
// If element VT is == 32 bits, turn it into a number of shuffles.
V.resize(NumElems);
if (NumElems == 4 && NumZero > 0) {
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
if (isZero)
V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
default: break;
case 0:
V[i] = V[i*2]; // Must be a zero vector.
break;
case 1:
V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
break;
case 2:
V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
break;
case 3:
V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
break;
}
}
SmallVector<int, 8> MaskVec;
bool Reverse = (NonZeros & 0x3) == 2;
for (unsigned i = 0; i < 2; ++i)
MaskVec.push_back(Reverse ? 1-i : i);
Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
for (unsigned i = 0; i < 2; ++i)
MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
Nate Begeman
committed
if (Values.size() > 1 && VT.getSizeInBits() == 128) {
// Check for a build vector of consecutive loads.
for (unsigned i = 0; i < NumElems; ++i)
V[i] = Op.getOperand(i);
Nate Begeman
committed
// Check for elements which are consecutive loads.
SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
if (LD.getNode())
return LD;
// For SSE 4.1, use insertps to put the high elements into the low element.
Nate Begeman
committed
if (getSubtarget()->hasSSE41()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i));
}
return Result;
}
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
for (unsigned i = 0; i < NumElems; ++i) {
if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
V[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
// : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
// Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
unsigned EltStride = NumElems >> 1;
while (EltStride != 0) {
for (unsigned i = 0; i < EltStride; ++i) {
// If V[i+EltStride] is undef and this is the first round of mixing,
// then it is safe to just drop this shuffle: V[i] is already in the
// right place, the one element (since it's the first round) being
// inserted as undef can be dropped. This isn't safe for successive
// rounds because they will permute elements within both vectors.
if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
EltStride == NumElems/2)
continue;
V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
EltStride >>= 1;
}
return V[0];
}
X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
// We support concatenate two MMX registers and place them in a MMX
// register. This is better than doing a stack convert.
DebugLoc dl = Op.getDebugLoc();
EVT ResVT = Op.getValueType();
assert(Op.getNumOperands() == 2);
assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
int Mask[2];
Wesley Peck
committed
SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
InVec = Op.getOperand(1);
if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
unsigned NumElts = ResVT.getVectorNumElements();
Wesley Peck
committed
VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
} else {
Wesley Peck
committed
InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
Mask[0] = 0; Mask[1] = 2;
VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
}
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
Bruno Cardoso Lopes
committed
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
SmallVector<int, 8> MaskVals;
// Determine if more than 1 of the words in each of the low and high quadwords
// of the result come from the same quadword of one of the two inputs. Undef
// mask values count as coming from any quadword, for better codegen.
SmallVector<unsigned, 4> LoQuad(4);
SmallVector<unsigned, 4> HiQuad(4);
BitVector InputQuads(4);
for (unsigned i = 0; i < 8; ++i) {
SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
int EltIdx = SVOp->getMaskElt(i);
MaskVals.push_back(EltIdx);
if (EltIdx < 0) {
++Quad[0];
++Quad[1];
++Quad[2];
++Quad[3];
continue;
}
++Quad[EltIdx / 4];
InputQuads.set(EltIdx / 4);
}
unsigned MaxQuad = 1;
for (unsigned i = 0; i < 4; ++i) {
if (LoQuad[i] > MaxQuad) {
BestLoQuad = i;
MaxQuad = LoQuad[i];
}
}
MaxQuad = 1;
for (unsigned i = 0; i < 4; ++i) {
if (HiQuad[i] > MaxQuad) {
BestHiQuad = i;
MaxQuad = HiQuad[i];
}
}
// For SSSE3, If all 8 words of the result come from only 1 quadword of each
// of the two input vectors, shuffle them into one input vector so only a
// single pshufb instruction is necessary. If There are more than 2 input
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
Bruno Cardoso Lopes
committed
if (Subtarget->hasSSSE3()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
BestLoQuad = InputQuads.find_first();
BestHiQuad = InputQuads.find_next(BestLoQuad);
}
if (InputQuads.count() > 2) {
BestLoQuad = -1;
BestHiQuad = -1;
}
}
// If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
// the shuffle mask. If a quad is scored as -1, that means that it contains
// words from all 4 input quadwords.
SDValue NewV;
if (BestLoQuad >= 0 || BestHiQuad >= 0) {
SmallVector<int, 8> MaskV;
MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
Wesley Peck
committed
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
// Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
// source words for the shuffle, to aid later transformations.
bool AllWordsInNewV = true;
Mon P Wang
committed
bool InOrder[2] = { true, true };
for (unsigned i = 0; i != 8; ++i) {
int idx = MaskVals[i];
Mon P Wang
committed
if (idx != (int)i)
InOrder[i/4] = false;
if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
continue;
AllWordsInNewV = false;
break;
}
bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
if (AllWordsInNewV) {
for (int i = 0; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0)
continue;
idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
if ((idx != i) && idx < 4)
pshufhw = false;
if ((idx != i) && idx > 3)
pshuflw = false;
}
V1 = NewV;
V2Used = false;
BestLoQuad = 0;
BestHiQuad = 1;
}
// If we've eliminated the use of V2, and the new mask is a pshuflw or
// pshufhw, that's as cheap as it gets. Return the new shuffle.
Mon P Wang
committed
if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
unsigned TargetMask = 0;
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
Owen Anderson
committed
DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()):
X86::getShufflePSHUFLWImmediate(NewV.getNode());
V1 = NewV.getOperand(0);
Bruno Cardoso Lopes
committed
return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
Bruno Cardoso Lopes
committed
if (Subtarget->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
// shuffle mask element to zero out elements that come from V2 in the V1
// mask, and elements that come from V1 in the V2 mask, so that the two
// results can be OR'd together.
bool TwoInputs = V1Used && V2Used;
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
if (TwoInputs && (EltIdx >= 16)) {
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
Wesley Peck
committed
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
Owen Anderson
committed
MVT::v16i8, &pshufbMask[0], 16));
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
if (EltIdx < 16) {
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
Bill Wendling
committed
continue;
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
}
Wesley Peck
committed
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
Owen Anderson
committed
MVT::v16i8, &pshufbMask[0], 16));
V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
}
// If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
// and update MaskVals with new element order.
BitVector InOrder(8);
if (BestLoQuad >= 0) {
SmallVector<int, 8> MaskV;
for (int i = 0; i != 4; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
MaskV.push_back(-1);
InOrder.set(i);
} else if ((idx / 4) == BestLoQuad) {
MaskV.push_back(idx & 3);
InOrder.set(i);
} else {
MaskV.push_back(-1);
}
for (unsigned i = 4; i != 8; ++i)
MaskV.push_back(i);
Owen Anderson
committed
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
Bruno Cardoso Lopes
committed
if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFLWImmediate(NewV.getNode()),
DAG);
// If BestHi >= 0, generate a pshufhw to put the high elements in order,
// and update MaskVals with the new element order.
if (BestHiQuad >= 0) {
SmallVector<int, 8> MaskV;
for (unsigned i = 0; i != 4; ++i)
MaskV.push_back(i);
for (unsigned i = 4; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
MaskV.push_back(-1);
InOrder.set(i);
} else if ((idx / 4) == BestHiQuad) {
MaskV.push_back((idx & 3) + 4);
InOrder.set(i);
} else {
MaskV.push_back(-1);
Owen Anderson
committed
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
Bruno Cardoso Lopes
committed
if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFHWImmediate(NewV.getNode()),
DAG);
// In case BestHi & BestLo were both -1, which means each quadword has a word
// from each of the four input quadwords, calculate the InOrder bitvector now
// before falling through to the insert/extract cleanup.
if (BestLoQuad == -1 && BestHiQuad == -1) {
NewV = V1;
for (int i = 0; i != 8; ++i)
if (MaskVals[i] < 0 || MaskVals[i] == i)
InOrder.set(i);
}
// The other elements are put in the right place using pextrw and pinsrw.
for (unsigned i = 0; i != 8; ++i) {
if (InOrder[i])
continue;
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
SDValue ExtOp = (EltIdx < 8)
Owen Anderson
committed
? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
DAG.getIntPtrConstant(EltIdx))
Owen Anderson
committed
: DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
DAG.getIntPtrConstant(EltIdx - 8));
Owen Anderson
committed
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
DAG.getIntPtrConstant(i));
// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG,
const X86TargetLowering &TLI) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
SmallVector<int, 16> MaskVals;
SVOp->getMask(MaskVals);
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
// FIXME: kill V2Only once shuffles are canonizalized by getNode.
bool V1Only = true;
bool V2Only = true;
for (unsigned i = 0; i < 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
if (EltIdx < 16)
V2Only = false;
else
V1Only = false;
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
if (TLI.getSubtarget()->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
// undef mask values to 0x80 (zero out result) in the pshufb mask.
//
// Otherwise, we have elements from both input vectors, and must zero out
// elements that come from V2 in the first mask, and V1 in the second mask
// so that we can OR them together.
bool TwoInputs = !(V1Only || V2Only);
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
}
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
// If all the elements are from V2, assign it to V1 and return after
// building the first pshufb.
if (V2Only)
V1 = V2;
Owen Anderson
committed
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
Owen Anderson
committed
MVT::v16i8, &pshufbMask[0], 16));
if (!TwoInputs)
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 16) {
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
Owen Anderson
committed
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
Owen Anderson
committed
MVT::v16i8, &pshufbMask[0], 16));
return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
// No SSSE3 - Calculate in place words and then fix all out of place words
// With 0-16 extracts & inserts. Worst case is 16 bytes out of order from
// the 16 different words that comprise the two doublequadword input vectors.
Wesley Peck
committed
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
SDValue NewV = V2Only ? V2 : V1;
for (int i = 0; i != 8; ++i) {
int Elt0 = MaskVals[i*2];
int Elt1 = MaskVals[i*2+1];
// This word of the result is all undef, skip it.
if (Elt0 < 0 && Elt1 < 0)
continue;
// This word of the result is already in the correct place, skip it.
if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
continue;
if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
continue;
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
SDValue InsElt;
Mon P Wang
committed
// If Elt0 and Elt1 are defined, are consecutive, and can be load
// using a single extract together, load it and store it.
if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
Owen Anderson
committed
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
Mon P Wang
committed
DAG.getIntPtrConstant(Elt1 / 2));
Owen Anderson
committed
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
Mon P Wang
committed
DAG.getIntPtrConstant(i));
continue;
}
// If Elt1 is defined, extract it from the appropriate source. If the
Mon P Wang
committed
// source byte is not also odd, shift the extracted word left 8 bits
// otherwise clear the bottom 8 bits if we need to do an or.
Owen Anderson
committed
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
DAG.getIntPtrConstant(Elt1 / 2));
if ((Elt1 & 1) == 0)
Owen Anderson
committed
InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
DAG.getConstant(8, TLI.getShiftAmountTy()));
Mon P Wang
committed
else if (Elt0 >= 0)
Owen Anderson
committed
InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
DAG.getConstant(0xFF00, MVT::i16));
}
// If Elt0 is defined, extract it from the appropriate source. If the
// source byte is not also even, shift the extracted word right 8 bits. If
// Elt1 was also defined, OR the extracted values together before
// inserting them in the result.
if (Elt0 >= 0) {
Owen Anderson
committed
SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
if ((Elt0 & 1) != 0)
Owen Anderson
committed
InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
DAG.getConstant(8, TLI.getShiftAmountTy()));
Mon P Wang
committed
else if (Elt1 >= 0)
Owen Anderson
committed
InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
DAG.getConstant(0x00FF, MVT::i16));
InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
}
Owen Anderson
committed
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
DAG.getIntPtrConstant(i));
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}
Evan Cheng
committed
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
Evan Cheng
committed
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
Bruno Cardoso Lopes
committed
SelectionDAG &DAG, DebugLoc dl) {
Owen Anderson
committed
EVT VT = SVOp->getValueType(0);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
unsigned NumElems = VT.getVectorNumElements();
Evan Cheng
committed
unsigned NewWidth = (NumElems == 4) ? 2 : 4;
EVT NewVT;
Owen Anderson
committed
switch (VT.getSimpleVT().SimpleTy) {
default: assert(false && "Unexpected!");
Owen Anderson
committed
case MVT::v4f32: NewVT = MVT::v2f64; break;
case MVT::v4i32: NewVT = MVT::v2i64; break;
case MVT::v8i16: NewVT = MVT::v4i32; break;
case MVT::v16i8: NewVT = MVT::v4i32; break;
Evan Cheng
committed
}
int Scale = NumElems / NewWidth;
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i += Scale) {
int StartIdx = -1;
for (int j = 0; j < Scale; ++j) {
int EltIdx = SVOp->getMaskElt(i+j);
if (EltIdx < 0)
continue;
if (StartIdx == -1)
StartIdx = EltIdx - (EltIdx % Scale);
if (EltIdx != StartIdx + j)
}
if (StartIdx == -1)
MaskVec.push_back(-1);
else
MaskVec.push_back(StartIdx / Scale);
Wesley Peck
committed
V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
/// getVZextMovL - Return a zero-extending vector move low node.
Evan Cheng
committed
///
Owen Anderson
committed
static SDValue getVZextMovL(EVT VT, EVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget, DebugLoc dl) {
Owen Anderson
committed
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
Evan Cheng
committed
LoadSDNode *LD = NULL;
Gabor Greif
committed
if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
Evan Cheng
committed
LD = dyn_cast<LoadSDNode>(SrcOp);
if (!LD) {
// movssrr and movsdrr do not clear top bits. Try to use movd, movq
// instead.
if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
Evan Cheng
committed
SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
Wesley Peck
committed
SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
Evan Cheng
committed
// PR2108
Owen Anderson
committed
OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
OpVT,
Evan Cheng
committed
}
}
}
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
Wesley Peck
committed
DAG.getNode(ISD::BITCAST, dl,
Evan Cheng
committed
}
Evan Cheng
committed
/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
/// shuffles.
LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
Owen Anderson
committed
EVT VT = SVOp->getValueType(0);
Evan Cheng
committed
SmallVector<std::pair<int, int>, 8> Locs;
Locs.resize(4);
SmallVector<int, 8> Mask1(4U, -1);
SmallVector<int, 8> PermMask;
SVOp->getMask(PermMask);
Evan Cheng
committed
unsigned NumHi = 0;
unsigned NumLo = 0;
for (unsigned i = 0; i != 4; ++i) {
int Idx = PermMask[i];
if (Idx < 0) {
Evan Cheng
committed
Locs[i] = std::make_pair(-1, -1);
} else {
assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
if (Idx < 4) {
Evan Cheng
committed
Locs[i] = std::make_pair(0, NumLo);
Mask1[NumLo] = Idx;
Evan Cheng
committed
NumLo++;
} else {
Locs[i] = std::make_pair(1, NumHi);
if (2+NumHi < 4)
Mask1[2+NumHi] = Idx;
Evan Cheng
committed
NumHi++;
}
}
}
Evan Cheng
committed
if (NumLo <= 2 && NumHi <= 2) {
// If no more than two elements come from either vector. This can be
// implemented with two shuffles. First shuffle gather the elements.
// The second shuffle, which takes the first shuffle as both of its
// vector operands, put the elements into the right order.
V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
SmallVector<int, 8> Mask2(4U, -1);
Evan Cheng
committed
for (unsigned i = 0; i != 4; ++i) {
if (Locs[i].first == -1)
continue;
else {
unsigned Idx = (i < 2) ? 0 : 4;
Idx += Locs[i].first * 2 + Locs[i].second;
Mask2[i] = Idx;
Evan Cheng
committed
}
}
return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
} else if (NumLo == 3 || NumHi == 3) {
// Otherwise, we must have three elements from one vector, call it X, and
// one element from the other, call it Y. First, use a shufps to build an
// intermediate vector with the one element from Y and the element from X
// that will be in the same half in the final destination (the indexes don't
// matter). Then, use a shufps to build the final vector, taking the half
// containing the element from Y from the intermediate, and the other half
// from X.
if (NumHi == 3) {
// Normalize it so the 3 elements come from V1.
CommuteVectorShuffleMask(PermMask, VT);
std::swap(V1, V2);
}