Newer
Older
for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
MaskVec.push_back(DAG.getConstant(i, BaseVT));
MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT));
}
return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
&MaskVec[0], MaskVec.size());
/// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
/// of specified width.
static SDValue getUnpackhMask(unsigned NumElems, SelectionDAG &DAG,
DebugLoc dl) {
MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT BaseVT = MaskVT.getVectorElementType();
unsigned Half = NumElems/2;
for (unsigned i = 0; i != Half; ++i) {
MaskVec.push_back(DAG.getConstant(i + Half, BaseVT));
MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
}
return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
&MaskVec[0], MaskVec.size());
}
/// getSwapEltZeroMask - Returns a vector_shuffle mask for a shuffle that swaps
/// element #0 of a vector with the specified index, leaving the rest of the
/// elements in place.
static SDValue getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
SelectionDAG &DAG, DebugLoc dl) {
MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT BaseVT = MaskVT.getVectorElementType();
// Element #0 of the result gets the elt we are replacing.
MaskVec.push_back(DAG.getConstant(DestElt, BaseVT));
for (unsigned i = 1; i != NumElems; ++i)
MaskVec.push_back(DAG.getConstant(i == DestElt ? 0 : i, BaseVT));
return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
&MaskVec[0], MaskVec.size());
Evan Cheng
committed
/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
MVT PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32;
MVT VT = Op.getValueType();
Evan Cheng
committed
if (PVT == VT)
return Op;
SDValue V1 = Op.getOperand(0);
SDValue Mask = Op.getOperand(2);
unsigned MaskNumElems = Mask.getNumOperands();
unsigned NumElems = MaskNumElems;
DebugLoc dl = Op.getDebugLoc();
Evan Cheng
committed
// Special handling of v4f32 -> v4i32.
if (VT != MVT::v4f32) {
// Find which element we want to splat.
SDNode* EltNoNode = getSplatMaskEltNo(Mask.getNode()).getNode();
unsigned EltNo = cast<ConstantSDNode>(EltNoNode)->getZExtValue();
// unpack elements to the correct location
Evan Cheng
committed
while (NumElems > 4) {
if (EltNo < NumElems/2) {
Mask = getUnpacklMask(MaskNumElems, DAG, dl);
Mask = getUnpackhMask(MaskNumElems, DAG, dl);
EltNo -= NumElems/2;
}
V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V1, Mask);
Evan Cheng
committed
NumElems >>= 1;
}
SDValue Cst = DAG.getConstant(EltNo, MVT::i32);
Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1,
DAG.getUNDEF(PVT), Mask);
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle);
Evan Cheng
committed
3078
3079
3080
3081
3082
3083
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
3099
3100
/// isVectorLoad - Returns true if the node is a vector load, a scalar
/// load that's promoted to vector, or a load bitcasted.
static bool isVectorLoad(SDValue Op) {
assert(Op.getValueType().isVector() && "Expected a vector type");
if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR ||
Op.getOpcode() == ISD::BIT_CONVERT) {
return isa<LoadSDNode>(Op.getOperand(0));
}
return isa<LoadSDNode>(Op);
}
/// CanonicalizeMovddup - Cannonicalize movddup shuffle to v2f64.
///
static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask,
SelectionDAG &DAG, bool HasSSE3) {
// If we have sse3 and shuffle has more than one use or input is a load, then
// use movddup. Otherwise, use movlhps.
bool UseMovddup = HasSSE3 && (!Op.hasOneUse() || isVectorLoad(V1));
MVT PVT = UseMovddup ? MVT::v2f64 : MVT::v4f32;
MVT VT = Op.getValueType();
if (VT == PVT)
return Op;
DebugLoc dl = Op.getDebugLoc();
Evan Cheng
committed
unsigned NumElems = PVT.getVectorNumElements();
if (NumElems == 2) {
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
Evan Cheng
committed
} else {
assert(NumElems == 4);
SDValue Cst0 = DAG.getTargetConstant(0, MVT::i32);
SDValue Cst1 = DAG.getTargetConstant(1, MVT::i32);
Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
Cst0, Cst1, Cst0, Cst1);
Evan Cheng
committed
}
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1,
DAG.getUNDEF(PVT), Mask);
return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle);
Evan Cheng
committed
}
/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector of zero or undef vector. This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
Evan Cheng
committed
bool isZero, bool HasSSE2,
SelectionDAG &DAG) {
DebugLoc dl = V2.getDebugLoc();
MVT VT = V2.getValueType();
? getZeroVector(VT, HasSSE2, DAG, dl) : DAG.getUNDEF(VT);
unsigned NumElems = V2.getValueType().getVectorNumElements();
MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT EVT = MaskVT.getVectorElementType();
for (unsigned i = 0; i != NumElems; ++i)
if (i == Idx) // If this is the insertion idx, put the low elt of V2 here.
MaskVec.push_back(DAG.getConstant(NumElems, EVT));
else
MaskVec.push_back(DAG.getConstant(i, EVT));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
&MaskVec[0], MaskVec.size());
return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, Mask);
}
/// getNumOfConsecutiveZeros - Return the number of elements in a result of
/// a shuffle that is zero.
static
unsigned getNumOfConsecutiveZeros(SDValue Op, SDValue Mask,
unsigned NumElems, bool Low,
SelectionDAG &DAG) {
unsigned NumZeros = 0;
for (unsigned i = 0; i < NumElems; ++i) {
unsigned Index = Low ? i : NumElems-i-1;
if (Idx.getOpcode() == ISD::UNDEF) {
++NumZeros;
continue;
}
Gabor Greif
committed
SDValue Elt = DAG.getShuffleScalarElt(Op.getNode(), Index);
if (Elt.getNode() && isZeroNode(Elt))
++NumZeros;
else
break;
}
return NumZeros;
}
/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(SDValue Op, SDValue Mask, SelectionDAG &DAG,
bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
unsigned NumElems = Mask.getNumOperands();
isLeft = true;
unsigned NumZeros= getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
if (!NumZeros) {
isLeft = false;
NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
if (!NumZeros)
return false;
}
bool SeenV1 = false;
bool SeenV2 = false;
for (unsigned i = NumZeros; i < NumElems; ++i) {
unsigned Val = isLeft ? (i - NumZeros) : i;
SDValue Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));
if (Idx.getOpcode() == ISD::UNDEF)
continue;
unsigned Index = cast<ConstantSDNode>(Idx)->getZExtValue();
if (Index < NumElems)
SeenV1 = true;
else {
Index -= NumElems;
SeenV2 = true;
}
if (Index != Val)
return false;
}
if (SeenV1 && SeenV2)
return false;
ShVal = SeenV1 ? Op.getOperand(0) : Op.getOperand(1);
ShAmt = NumZeros;
return true;
}
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
if (NumNonZero > 8)
DebugLoc dl = Op.getDebugLoc();
bool First = true;
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
if (ThisIsNonZero && First) {
if (NumZero)
V = getZeroVector(MVT::v8i16, true, DAG, dl);
V = DAG.getUNDEF(MVT::v8i16);
First = false;
}
if ((i & 1) != 0) {
bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
if (LastIsNonZero) {
LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
MVT::i16, Op.getOperand(i-1));
}
if (ThisIsNonZero) {
ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
ThisElt, DAG.getConstant(8, MVT::i8));
if (LastIsNonZero)
ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
} else
ThisElt = LastElt;
Gabor Greif
committed
if (ThisElt.getNode())
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
DAG.getIntPtrConstant(i/2));
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
if (NumNonZero > 4)
DebugLoc dl = Op.getDebugLoc();
bool First = true;
for (unsigned i = 0; i < 8; ++i) {
bool isNonZero = (NonZeros & (1 << i)) != 0;
if (isNonZero) {
if (First) {
if (NumZero)
V = getZeroVector(MVT::v8i16, true, DAG, dl);
V = DAG.getUNDEF(MVT::v8i16);
First = false;
}
V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
MVT::v8i16, V, Op.getOperand(i),
DAG.getIntPtrConstant(i));
}
}
return V;
}
/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, DebugLoc dl) {
bool isMMX = VT.getSizeInBits() == 64;
MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
DAG.getNode(Opc, dl, ShVT, SrcOp,
DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
}
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
// All zero's are handled with pxor, all one's are handled with pcmpeqd.
if (ISD::isBuildVectorAllZeros(Op.getNode())
|| ISD::isBuildVectorAllOnes(Op.getNode())) {
// Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
// 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
// eliminated on x86-32 hosts.
if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
return Op;
Gabor Greif
committed
if (ISD::isBuildVectorAllOnes(Op.getNode()))
return getOnesVector(Op.getValueType(), DAG, dl);
return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
MVT VT = Op.getValueType();
MVT EVT = VT.getVectorElementType();
unsigned EVTBits = EVT.getSizeInBits();
unsigned NumElems = Op.getNumOperands();
unsigned NumZero = 0;
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
bool IsAllConstants = true;
for (unsigned i = 0; i < NumElems; ++i) {
Evan Cheng
committed
if (Elt.getOpcode() == ISD::UNDEF)
continue;
Values.insert(Elt);
if (Elt.getOpcode() != ISD::Constant &&
Elt.getOpcode() != ISD::ConstantFP)
IsAllConstants = false;
Evan Cheng
committed
if (isZeroNode(Elt))
NumZero++;
else {
NonZeros |= (1 << i);
NumNonZero++;
}
}
if (NumNonZero == 0) {
// All undef vector. Return an UNDEF. All zero vectors were handled above.
return DAG.getUNDEF(VT);
}
// Special case for single non-zero, non-undef, element.
Evan Cheng
committed
if (NumNonZero == 1 && NumElems <= 4) {
unsigned Idx = CountTrailingZeros_32(NonZeros);
// If this is an insertion of an i64 value on x86-32, and if the top bits of
// the value are obviously zero, truncate the value to i32 and do the
// insertion that way. Only do this if the value is non-constant or if the
// value is a constant being inserted into element 0. It is cheaper to do
// a constant pool load than it is to do a movd + shuffle.
if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
(!IsAllConstants || Idx == 0)) {
if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
// Handle MMX and SSE both.
MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
// Truncate the value (which may itself be a constant) to i32, and
// convert it to a vector with movd (S2V+shuffle to zero extend).
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
Evan Cheng
committed
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
Subtarget->hasSSE2(), DAG);
// Now we have our 32-bit value zero extended in the low element of
// a vector. If Idx != 0, swizzle it into place.
if (Idx != 0) {
SDValue Ops[] = {
Item, DAG.getUNDEF(Item.getValueType()),
getSwapEltZeroMask(VecElts, Idx, DAG, dl)
Item = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VecVT, Ops, 3);
return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is. Because we can only get here
// when NumElems <= 4, this only needs to handle i32/f32/i64/f64.
if (Idx == 0 &&
// Don't do this for i64 values on x86-32.
(EVT != MVT::i64 || Subtarget->is64Bit())) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
Evan Cheng
committed
return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
Subtarget->hasSSE2(), DAG);
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
}
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Evan Cheng
committed
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
Subtarget->hasSSE2(), DAG);
MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT MaskEVT = MaskVT.getVectorElementType();
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
&MaskVec[0], MaskVec.size());
return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, Item,
DAG.getUNDEF(VT), Mask);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
if (Values.size() == 1)
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
if (IsAllConstants)
// Let legalizer expand 2-wide build_vectors.
Evan Cheng
committed
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = CountTrailingZeros_32(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Evan Cheng
committed
Op.getOperand(Idx));
Evan Cheng
committed
return getShuffleVectorZeroOrUndef(V2, Idx, true,
Subtarget->hasSSE2(), DAG);
Evan Cheng
committed
}
Evan Cheng
committed
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
if (EVTBits == 8 && NumElems == 16) {
SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
Gabor Greif
committed
if (V.getNode()) return V;
if (EVTBits == 16 && NumElems == 8) {
SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
Gabor Greif
committed
if (V.getNode()) return V;
}
// If element VT is == 32 bits, turn it into a number of shuffles.
V.resize(NumElems);
if (NumElems == 4 && NumZero > 0) {
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
if (isZero)
V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
default: break;
case 0:
V[i] = V[i*2]; // Must be a zero vector.
break;
case 1:
V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2+1], V[i*2],
getMOVLMask(NumElems, DAG, dl));
break;
case 2:
V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1],
getMOVLMask(NumElems, DAG, dl));
break;
case 3:
V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1],
getUnpacklMask(NumElems, DAG, dl));
break;
}
}
MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
MVT EVT = MaskVT.getVectorElementType();
bool Reverse = (NonZeros & 0x3) == 2;
for (unsigned i = 0; i < 2; ++i)
if (Reverse)
MaskVec.push_back(DAG.getConstant(1-i, EVT));
else
MaskVec.push_back(DAG.getConstant(i, EVT));
Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
for (unsigned i = 0; i < 2; ++i)
if (Reverse)
MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
else
MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
&MaskVec[0], MaskVec.size());
return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[0], V[1], ShufMask);
}
if (Values.size() > 2) {
// Expand into a number of unpckl*.
// e.g. for v4f32
// Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
// : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
// Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
SDValue UnpckMask = getUnpacklMask(NumElems, DAG, dl);
for (unsigned i = 0; i < NumElems; ++i)
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
NumElems >>= 1;
while (NumElems != 0) {
for (unsigned i = 0; i < NumElems; ++i)
V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i], V[i + NumElems],
UnpckMask);
NumElems >>= 1;
}
return V[0];
}
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
SDValue PermMask, SelectionDAG &DAG,
X86TargetLowering &TLI, DebugLoc dl) {
Gabor Greif
committed
SmallVector<SDValue, 8> MaskElts(PermMask.getNode()->op_begin(),
PermMask.getNode()->op_end());
SmallVector<int, 8> MaskVals;
// Determine if more than 1 of the words in each of the low and high quadwords
// of the result come from the same quadword of one of the two inputs. Undef
// mask values count as coming from any quadword, for better codegen.
SmallVector<unsigned, 4> LoQuad(4);
SmallVector<unsigned, 4> HiQuad(4);
BitVector InputQuads(4);
for (unsigned i = 0; i < 8; ++i) {
SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 :
cast<ConstantSDNode>(Elt)->getZExtValue();
MaskVals.push_back(EltIdx);
if (EltIdx < 0) {
++Quad[0];
++Quad[1];
++Quad[2];
++Quad[3];
continue;
}
++Quad[EltIdx / 4];
InputQuads.set(EltIdx / 4);
}
unsigned MaxQuad = 1;
for (unsigned i = 0; i < 4; ++i) {
if (LoQuad[i] > MaxQuad) {
BestLoQuad = i;
MaxQuad = LoQuad[i];
}
}
MaxQuad = 1;
for (unsigned i = 0; i < 4; ++i) {
if (HiQuad[i] > MaxQuad) {
BestHiQuad = i;
MaxQuad = HiQuad[i];
}
}
// For SSSE3, If all 8 words of the result come from only 1 quadword of each
// of the two input vectors, shuffle them into one input vector so only a
// single pshufb instruction is necessary. If There are more than 2 input
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
if (TLI.getSubtarget()->hasSSSE3()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
BestLoQuad = InputQuads.find_first();
BestHiQuad = InputQuads.find_next(BestLoQuad);
}
if (InputQuads.count() > 2) {
BestLoQuad = -1;
BestHiQuad = -1;
}
}
// If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
// the shuffle mask. If a quad is scored as -1, that means that it contains
// words from all 4 input quadwords.
SDValue NewV;
if (BestLoQuad >= 0 || BestHiQuad >= 0) {
SmallVector<SDValue,8> MaskV;
MaskV.push_back(DAG.getConstant(BestLoQuad < 0 ? 0 : BestLoQuad, MVT::i64));
MaskV.push_back(DAG.getConstant(BestHiQuad < 0 ? 1 : BestHiQuad, MVT::i64));
SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, &MaskV[0], 2);
NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64,
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask);
NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
// Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
// source words for the shuffle, to aid later transformations.
bool AllWordsInNewV = true;
Mon P Wang
committed
bool InOrder[2] = { true, true };
for (unsigned i = 0; i != 8; ++i) {
int idx = MaskVals[i];
Mon P Wang
committed
if (idx != (int)i)
InOrder[i/4] = false;
if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
continue;
AllWordsInNewV = false;
break;
}
bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
if (AllWordsInNewV) {
for (int i = 0; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0)
continue;
idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
if ((idx != i) && idx < 4)
pshufhw = false;
if ((idx != i) && idx > 3)
pshuflw = false;
}
V1 = NewV;
V2Used = false;
BestLoQuad = 0;
BestHiQuad = 1;
}
// If we've eliminated the use of V2, and the new mask is a pshuflw or
// pshufhw, that's as cheap as it gets. Return the new shuffle.
Mon P Wang
committed
if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
MaskV.clear();
for (unsigned i = 0; i != 8; ++i)
MaskV.push_back((MaskVals[i] < 0) ? DAG.getUNDEF(MVT::i16)
: DAG.getConstant(MaskVals[i],
MVT::i16));
return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
DAG.getUNDEF(MVT::v8i16),
DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16,
&MaskV[0], 8));
}
}
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
if (TLI.getSubtarget()->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
// shuffle mask element to zero out elements that come from V2 in the V1
// mask, and elements that come from V1 in the V2 mask, so that the two
// results can be OR'd together.
bool TwoInputs = V1Used && V2Used;
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
if (TwoInputs && (EltIdx >= 16)) {
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
}
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
}
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
if (!TwoInputs)
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
if (EltIdx < 16) {
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
Bill Wendling
committed
continue;
}
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
}
V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
}
// If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
// and update MaskVals with new element order.
BitVector InOrder(8);
if (BestLoQuad >= 0) {
SmallVector<SDValue, 8> MaskV;
for (int i = 0; i != 4; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
MaskV.push_back(DAG.getUNDEF(MVT::i16));
InOrder.set(i);
} else if ((idx / 4) == BestLoQuad) {
MaskV.push_back(DAG.getConstant(idx & 3, MVT::i16));
InOrder.set(i);
} else {
MaskV.push_back(DAG.getUNDEF(MVT::i16));
}
}
for (unsigned i = 4; i != 8; ++i)
MaskV.push_back(DAG.getConstant(i, MVT::i16));
NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
DAG.getUNDEF(MVT::v8i16),
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v8i16, &MaskV[0], 8));
}
// If BestHi >= 0, generate a pshufhw to put the high elements in order,
// and update MaskVals with the new element order.
if (BestHiQuad >= 0) {
SmallVector<SDValue, 8> MaskV;
for (unsigned i = 0; i != 4; ++i)
MaskV.push_back(DAG.getConstant(i, MVT::i16));
for (unsigned i = 4; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
MaskV.push_back(DAG.getUNDEF(MVT::i16));
InOrder.set(i);
} else if ((idx / 4) == BestHiQuad) {
MaskV.push_back(DAG.getConstant((idx & 3) + 4, MVT::i16));
InOrder.set(i);
} else {
MaskV.push_back(DAG.getUNDEF(MVT::i16));
}
NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
DAG.getUNDEF(MVT::v8i16),
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v8i16, &MaskV[0], 8));
3773
3774
3775
3776
3777
3778
3779
3780
3781
3782
3783
3784
3785
3786
3787
3788
3789
3790
3791
3792
3793
3794
3795
3796
3797
3798
}
// In case BestHi & BestLo were both -1, which means each quadword has a word
// from each of the four input quadwords, calculate the InOrder bitvector now
// before falling through to the insert/extract cleanup.
if (BestLoQuad == -1 && BestHiQuad == -1) {
NewV = V1;
for (int i = 0; i != 8; ++i)
if (MaskVals[i] < 0 || MaskVals[i] == i)
InOrder.set(i);
}
// The other elements are put in the right place using pextrw and pinsrw.
for (unsigned i = 0; i != 8; ++i) {
if (InOrder[i])
continue;
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
SDValue ExtOp = (EltIdx < 8)
? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
DAG.getIntPtrConstant(EltIdx))
: DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
DAG.getIntPtrConstant(EltIdx - 8));
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
DAG.getIntPtrConstant(i));
3803
3804
3805
3806
3807
3808
3809
3810
3811
3812
3813
3814
3815
3816
3817
3818
3819
3820
3821
3822
3823
3824
3825
3826
3827
3828
3829
3830
3831
3832
3833
3834
3835
3836
3837
3838
3839
3840
3841
3842
3843
3844
3845
3846
3847
3848
3849
// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(SDValue V1, SDValue V2,
SDValue PermMask, SelectionDAG &DAG,
X86TargetLowering &TLI, DebugLoc dl) {
SmallVector<SDValue, 16> MaskElts(PermMask.getNode()->op_begin(),
PermMask.getNode()->op_end());
SmallVector<int, 16> MaskVals;
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
// FIXME: kill V2Only once shuffles are canonizalized by getNode.
bool V1Only = true;
bool V2Only = true;
for (unsigned i = 0; i < 16; ++i) {
SDValue Elt = MaskElts[i];
int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 :
cast<ConstantSDNode>(Elt)->getZExtValue();
MaskVals.push_back(EltIdx);
if (EltIdx < 0)
continue;
if (EltIdx < 16)
V2Only = false;
else
V1Only = false;
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
if (TLI.getSubtarget()->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
// undef mask values to 0x80 (zero out result) in the pshufb mask.
//
// Otherwise, we have elements from both input vectors, and must zero out
// elements that come from V2 in the first mask, and V1 in the second mask
// so that we can OR them together.
bool TwoInputs = !(V1Only || V2Only);
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
}
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
// If all the elements are from V2, assign it to V1 and return after
// building the first pshufb.
if (V2Only)
V1 = V2;
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
if (!TwoInputs)
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 16) {
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
}
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
}
// No SSSE3 - Calculate in place words and then fix all out of place words
// With 0-16 extracts & inserts. Worst case is 16 bytes out of order from
// the 16 different words that comprise the two doublequadword input vectors.
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
SDValue NewV = V2Only ? V2 : V1;
for (int i = 0; i != 8; ++i) {
int Elt0 = MaskVals[i*2];
int Elt1 = MaskVals[i*2+1];
// This word of the result is all undef, skip it.
if (Elt0 < 0 && Elt1 < 0)
continue;
// This word of the result is already in the correct place, skip it.
if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
continue;
if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
continue;
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
SDValue InsElt;
Mon P Wang
committed
// If Elt0 and Elt1 are defined, are consecutive, and can be load
// using a single extract together, load it and store it.
if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
DAG.getIntPtrConstant(Elt1 / 2));
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
DAG.getIntPtrConstant(i));
continue;
}
// If Elt1 is defined, extract it from the appropriate source. If the
Mon P Wang
committed
// source byte is not also odd, shift the extracted word left 8 bits
// otherwise clear the bottom 8 bits if we need to do an or.
if (Elt1 >= 0) {
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
DAG.getIntPtrConstant(Elt1 / 2));
if ((Elt1 & 1) == 0)
InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
DAG.getConstant(8, TLI.getShiftAmountTy()));
Mon P Wang
committed
else if (Elt0 >= 0)
InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
DAG.getConstant(0xFF00, MVT::i16));
}
// If Elt0 is defined, extract it from the appropriate source. If the
// source byte is not also even, shift the extracted word right 8 bits. If
// Elt1 was also defined, OR the extracted values together before
// inserting them in the result.
if (Elt0 >= 0) {
SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
if ((Elt0 & 1) != 0)
InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
DAG.getConstant(8, TLI.getShiftAmountTy()));
Mon P Wang
committed
else if (Elt1 >= 0)
InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
DAG.getConstant(0x00FF, MVT::i16));
InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
: InsElt0;
}
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
DAG.getIntPtrConstant(i));
return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
}
Evan Cheng
committed
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2,
TargetLowering &TLI, DebugLoc dl) {
unsigned NumElems = PermMask.getNumOperands();
Evan Cheng
committed
unsigned NewWidth = (NumElems == 4) ? 2 : 4;
MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
MVT MaskEltVT = MaskVT.getVectorElementType();
MVT NewVT = MaskVT;
switch (VT.getSimpleVT()) {
default: assert(false && "Unexpected!");
Evan Cheng
committed
case MVT::v4f32: NewVT = MVT::v2f64; break;
case MVT::v4i32: NewVT = MVT::v2i64; break;
case MVT::v8i16: NewVT = MVT::v4i32; break;
case MVT::v16i8: NewVT = MVT::v4i32; break;
}
if (NewWidth == 2) {
Evan Cheng
committed
NewVT = MVT::v2i64;
else
NewVT = MVT::v2f64;
Evan Cheng
committed
unsigned Scale = NumElems / NewWidth;
for (unsigned i = 0; i < NumElems; i += Scale) {
unsigned StartIdx = ~0U;
for (unsigned j = 0; j < Scale; ++j) {
if (Elt.getOpcode() == ISD::UNDEF)
continue;
unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
if (StartIdx == ~0U)
StartIdx = EltIdx - (EltIdx % Scale);
if (EltIdx != StartIdx + j)
}
if (StartIdx == ~0U)
MaskVec.push_back(DAG.getUNDEF(MaskEltVT));
else
MaskVec.push_back(DAG.getConstant(StartIdx / Scale, MaskEltVT));
V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, NewVT, V1, V2,