Newer
Older
if (Idx < NumElems)
SeenV1 = true;
SeenV2 = true;
// Only accept consecutive elements from the same vector
if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
return false;
}
OpNum = SeenV1 ? 0 : 1;
return true;
}
/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical right shift of a vector.
///
/// On success isLeft is set to false, ShAmt receives the number of
/// consecutive zero elements that were found and ShVal receives the source
/// operand of the shuffle that is being shifted. The out-parameters are left
/// untouched when the function returns false.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  const unsigned EltCount = SVOp->getValueType(0).getVectorNumElements();

  // Without a run of zeros there is nothing a shift could have shifted in.
  const unsigned ZeroRun =
      getNumOfConsecutiveZeros(SVOp, EltCount,
                               false /* check zeros from right */, DAG);
  if (ZeroRun == 0)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //   V1 = {X, A, B, C}     0
  //            \  \  \    /
  //   vector_shuffle V1, V2 <1, 2, 3, X>
  //
  const unsigned MaskFirst = 0;                      // Mask Start Index
  const unsigned MaskLast = EltCount - ZeroRun - 1;  // Mask End Index
  unsigned SrcOperand;                               // Which source operand ?
  const bool FromOneSource =
      isShuffleMaskConsecutive(SVOp, MaskFirst, MaskLast,
                               ZeroRun,   // Where to start looking in the src
                               EltCount,  // Number of elements in vector
                               SrcOperand);
  if (!FromOneSource)
    return false;

  ShVal = SVOp->getOperand(SrcOperand);
  ShAmt = ZeroRun;
  isLeft = false;
  return true;
}
4049
4050
4051
4052
4053
4054
4055
4056
4057
4058
4059
4060
4061
4062
4063
4064
4065
4066
4067
4068
4069
4070
4071
4072
4073
4074
4075
4076
4077
4078
4079
4080
4081
4082
4083
4084
4085
4086
4087
4088
4089
4090
4091
/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
///
/// On success isLeft is set to true, ShAmt receives the number of
/// consecutive zero elements that were found and ShVal receives the source
/// operand of the shuffle that is being shifted. The out-parameters are left
/// untouched when the function returns false.
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  const unsigned EltCount = SVOp->getValueType(0).getVectorNumElements();

  // Without a run of zeros there is nothing a shift could have shifted in.
  const unsigned ZeroRun =
      getNumOfConsecutiveZeros(SVOp, EltCount,
                               true /* check zeros from left */, DAG);
  if (ZeroRun == 0)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                           0     { A, B, X, X } = V2
  //                          / \    /  /
  //   vector_shuffle V1, V2 <X, X, 4, 5>
  //
  const unsigned MaskFirst = ZeroRun;       // Mask Start Index
  const unsigned MaskLast = EltCount - 1;   // Mask End Index
  unsigned SrcOperand;                      // Which source operand ?
  const bool FromOneSource =
      isShuffleMaskConsecutive(SVOp, MaskFirst, MaskLast,
                               0,         // Where to start looking in the src
                               EltCount,  // Number of elements in vector
                               SrcOperand);
  if (!FromOneSource)
    return false;

  ShVal = SVOp->getOperand(SrcOperand);
  ShAmt = ZeroRun;
  isLeft = true;
  return true;
}
/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
///
/// The left-shift form is tried first; the right-shift form is only
/// consulted when the left-shift match fails. isLeft, ShVal and ShAmt are
/// filled in by whichever matcher succeeds.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  return isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
         isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt);
}
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
/// The sixteen byte operands of Op are processed in pairs: each pair is
/// combined into one i16 (the odd byte zero-extended and shifted left by 8,
/// OR'd with the zero-extended even byte) and inserted into a v8i16
/// accumulator, which is bitcast to v16i8 at the end.
///
/// NonZeros is a bitmask in which bit i marks operand i as an element that
/// has to be inserted. NumZero selects the initial accumulator: a zero vector
/// when it is non-zero, undef otherwise. Returns a null SDValue when
/// NumNonZero is greater than 8, which callers treat as "not lowered".
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V;
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;

    // Create the accumulator lazily, the first time an element is inserted.
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    // Bytes are emitted two at a time, once the odd byte of a pair is seen.
    if ((i & 1) != 0) {
      SDValue ThisElt, LastElt;
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        // The odd byte forms the high half of the 16-bit element.
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      // Nothing to insert when neither byte of the pair was marked.
      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
}
/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
/// Every operand of Op whose bit is set in NonZeros is inserted into a v8i16
/// accumulator with INSERT_VECTOR_ELT. NumZero selects the initial
/// accumulator: a zero vector when it is non-zero, undef otherwise. Returns a
/// null SDValue when NumNonZero is greater than 4, which callers treat as
/// "not lowered".
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V;
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      // Create the accumulator lazily, the first time an element is inserted.
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, true, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }
  return V;
}
/// getVShift - Return a vector logical shift node.
///
Owen Anderson
committed
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
unsigned NumBits, SelectionDAG &DAG,
const TargetLowering &TLI, DebugLoc dl) {
unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
Wesley Peck
committed
SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(Opc, dl, ShVT, SrcOp,
DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
}
Evan Cheng
committed
/// LowerAsSplatVectorLoad - Try to turn a splat of a scalar stack load into
/// a 16-byte vector load followed by a splat shuffle of the loaded element.
///
/// Only normal, non-volatile i32/f32 loads whose address is a frame index,
/// or a frame index plus a constant offset, are handled. The stack object's
/// alignment is raised to 16 when it is not already inferred to be at least
/// 16; fixed objects cannot be realigned and are rejected. Returns a null
/// SDValue whenever the transformation does not apply.
SDValue
X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                                          SelectionDAG &DAG) const {
  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < 16) {
      if (MFI->isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI->setObjectAlignment(FI, 16);
      }
    }

    // (Offset % 16) must be a multiple of 4. The address of the vector load
    // is then Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % 16) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~15;
    if (StartOffset)
      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));

    // Index of the wanted 32-bit element inside the 16-byte chunk.
    int EltNo = (Offset - StartOffset) >> 2;
    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };

    // The result type is derived from the scalar load type. This local used
    // to be called VT and shadowed the VT parameter; it has been renamed, the
    // parameter is (still) not consulted.
    EVT LoadVT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
    // NOTE(review): the trailing getLoad arguments (isVolatile,
    // isNonTemporal, Alignment) were absent from the text this block was
    // recovered from; "false, false, 0" is believed to match upstream -
    // confirm.
    SDValue V1 = DAG.getLoad(LoadVT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset),
                             false, false, 0);
    // Canonicalize it to a v4i32 shuffle.
    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
    return DAG.getNode(ISD::BITCAST, dl, LoadVT,
                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
                                            DAG.getUNDEF(MVT::v4i32),&Mask[0]));
  }

  return SDValue();
}
/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
/// vector of type 'VT', see if the elements can be replaced by a single large
/// load which has the same value as a build_vector whose operands are 'elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
Nate Begeman
committed
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
DebugLoc &DL, SelectionDAG &DAG) {
Nate Begeman
committed
EVT EltVT = VT.getVectorElementType();
unsigned NumElems = Elts.size();
Nate Begeman
committed
LoadSDNode *LDBase = NULL;
unsigned LastLoadedElt = -1U;
// For each element in the initializer, see if we've found a load or an undef.
// If we don't find an initial load element, or later load elements are
Nate Begeman
committed
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Elts[i];
Nate Begeman
committed
if (!Elt.getNode() ||
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
return SDValue();
if (!LDBase) {
if (Elt.getNode()->getOpcode() == ISD::UNDEF)
return SDValue();
LDBase = cast<LoadSDNode>(Elt.getNode());
LastLoadedElt = i;
continue;
}
if (Elt.getOpcode() == ISD::UNDEF)
continue;
LoadSDNode *LD = cast<LoadSDNode>(Elt);
if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
return SDValue();
LastLoadedElt = i;
}
// If we have found an entire vector of loads and undefs, then return a large
// load of the entire vector width starting at the base pointer. If we found
// consecutive loads for the low half, generate a vzext_load node.
Nate Begeman
committed
if (LastLoadedElt == NumElems - 1) {
if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
Nate Begeman
committed
LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(),
Nate Begeman
committed
LDBase->isVolatile(), LDBase->isNonTemporal(),
LDBase->getAlignment());
} else if (NumElems == 4 && LastLoadedElt == 1) {
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
Ops, 2, MVT::i32,
LDBase->getMemOperand());
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
Nate Begeman
committed
}
return SDValue();
}
// LowerBUILD_VECTOR - Custom lowering of a BUILD_VECTOR node. The visible
// code handles, in order: wide (AVX) vectors split into two halves, all-zero
// and all-one vectors, a single non-zero element, splats, all-constant
// vectors, 64-bit elements, v16i8 / v8i16 helpers, 4-element vectors with
// zeros, and finally consecutive loads / insertps / unpckl expansion for
// 128-bit vectors.
//
// NOTE(review): this text was recovered from a blame-style listing. Author
// names, "committed" markers and bare line numbers are interleaved with the
// code, and a number of source lines (declarations, call arguments, returns,
// closing braces - including the return type line that should precede the
// definition) appear to be missing. The most visible gaps are marked below;
// restore the block from upstream before building.
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
// NOTE(review): the following run of numbers is line-number residue from the
// listing, not code.
4323
4324
4325
4326
4327
4328
4329
4330
4331
4332
4333
4334
4335
4336
4337
4338
4339
4340
4341
4342
4343
4344
4345
4346
4347
4348
4349
4350
EVT VT = Op.getValueType();
EVT ExtVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// For AVX-length vectors, build the individual 128-bit pieces and
// use shuffles to put them in place.
// NOTE(review): the guard below tests for more than 256 bits although the
// comment above speaks of AVX-length vectors - confirm this is intended.
if (VT.getSizeInBits() > 256 &&
Subtarget->hasAVX() &&
!Disable256Bit &&
!ISD::isBuildVectorAllZeros(Op.getNode())) {
SmallVector<SDValue, 8> V;
V.resize(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
V[i] = Op.getOperand(i);
}
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
// Build the lower subvector.
SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
// Build the upper subvector.
SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
NumElems/2);
return ConcatVectors(Lower, Upper, DAG);
}
// All zero's are handled with pxor in SSE2 and above, xorps in SSE1.
// All one's are handled with pcmpeqd. In AVX, zero's are handled with
Bruno Cardoso Lopes
committed
// vpxor in 128-bit and xor{pd,ps} in 256-bit, but no 256 version of pcmpeqd
// is present, so AllOnes is ignored.
if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
(Op.getValueType().getSizeInBits() != 256 &&
ISD::isBuildVectorAllOnes(Op.getNode()))) {
// NOTE(review): the start of the next comment and at least one statement
// appear to be missing here, as does the closing brace of this `if`.
// 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
// eliminated on x86-32 hosts.
Gabor Greif
committed
if (ISD::isBuildVectorAllOnes(Op.getNode()))
return getOnesVector(Op.getValueType(), DAG, dl);
return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
Owen Anderson
committed
unsigned EVTBits = ExtVT.getSizeInBits();
// Classify the operands: NonZeros gets bit i set for every operand that is
// neither undef nor a zero node; IsAllConstants is cleared by any operand
// that is not a Constant / ConstantFP.
unsigned NumZero = 0;
unsigned NumNonZero = 0;
unsigned NonZeros = 0;
bool IsAllConstants = true;
// NOTE(review): the declarations of `Values` and `Elt` are not present in
// this text.
for (unsigned i = 0; i < NumElems; ++i) {
Evan Cheng
committed
if (Elt.getOpcode() == ISD::UNDEF)
continue;
Values.insert(Elt);
if (Elt.getOpcode() != ISD::Constant &&
Elt.getOpcode() != ISD::ConstantFP)
IsAllConstants = false;
Evan Cheng
committed
if (X86::isZeroNode(Elt))
Evan Cheng
committed
NumZero++;
else {
NonZeros |= (1 << i);
NumNonZero++;
}
}
// All undef vector. Return an UNDEF. All zero vectors were handled above.
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
unsigned Idx = CountTrailingZeros_32(NonZeros);
// NOTE(review): `Item` is used below but its declaration is not present in
// this text.
// If this is an insertion of an i64 value on x86-32, and if the top bits of
// the value are obviously zero, truncate the value to i32 and do the
// insertion that way. Only do this if the value is non-constant or if the
// value is a constant being inserted into element 0. It is cheaper to do
// a constant pool load than it is to do a movd + shuffle.
Owen Anderson
committed
if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
(!IsAllConstants || Idx == 0)) {
if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
// Handle SSE only.
assert(VT == MVT::v2i64 && "Expected an SSE value type!");
EVT VecVT = MVT::v4i32;
unsigned VecElts = 4;
// Truncate the value (which may itself be a constant) to i32, and
// convert it to a vector with movd (S2V+shuffle to zero extend).
Owen Anderson
committed
Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
Evan Cheng
committed
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
Subtarget->hasSSE2(), DAG);
// Now we have our 32-bit value zero extended in the low element of
// a vector. If Idx != 0, swizzle it into place.
if (Idx != 0) {
SmallVector<int, 4> Mask;
Mask.push_back(Idx);
for (unsigned i = 1; i != VecElts; ++i)
Mask.push_back(i);
// NOTE(review): the shuffle call below appears to have lost an argument
// line, and the closing braces after the return that follows are missing.
Item = DAG.getVectorShuffle(VecVT, dl, Item,
&Mask[0]);
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item);
// If we have a constant or non-constant insertion into the low element of
// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.
if (Idx == 0) {
if (NumZero == 0) {
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
Owen Anderson
committed
} else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
DAG);
Owen Anderson
committed
} else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
EVT MiddleVT = MVT::v4i32;
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
Item = getShuffleVectorZeroOrUndef(Item, 0, true,
Subtarget->hasSSE2(), DAG);
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, VT, Item);
// NOTE(review): closing braces for the `else if` / `if (Idx == 0)` above are
// missing here.
// Is it a vector logical left shift?
if (NumElems == 2 && Idx == 1 &&
Evan Cheng
committed
X86::isZeroNode(Op.getOperand(0)) &&
!X86::isZeroNode(Op.getOperand(1))) {
unsigned NumBits = VT.getSizeInBits();
// NOTE(review): the remaining getVShift arguments are missing from this text
// (NumBits is declared above but never used in what is visible).
return getVShift(true, VT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
VT, Op.getOperand(1)),
}
// NOTE(review): the statement controlled by the next `if` is missing.
if (IsAllConstants) // Otherwise, it's better to do a constpool load.
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
if (EVTBits == 32) {
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a shuffle of zero and zero-extended scalar to vector.
Evan Cheng
committed
Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
Subtarget->hasSSE2(), DAG);
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i++)
MaskVec.push_back(i == Idx ? 0 : 1);
return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
}
}
// Splat is obviously ok. Let legalizer expand it to a shuffle.
Evan Cheng
committed
if (Values.size() == 1) {
if (EVTBits == 32) {
// Instead of a shuffle like this:
// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
// Check if it's possible to issue this instead.
// shuffle (vload ptr)), undef, <1, 1, 1, 1>
unsigned Idx = CountTrailingZeros_32(NonZeros);
SDValue Item = Op.getOperand(Idx);
if (Op.getNode()->isOnlyUserOf(Item.getNode()))
return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
}
Evan Cheng
committed
}
// A vector full of immediates; various special cases are already
// handled, so this is best done with a single constant-pool load.
// NOTE(review): the statement controlled by the next `if` is missing.
if (IsAllConstants)
// Let legalizer expand 2-wide build_vectors.
Evan Cheng
committed
if (EVTBits == 64) {
if (NumNonZero == 1) {
// One half is zero or undef.
unsigned Idx = CountTrailingZeros_32(NonZeros);
SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
Evan Cheng
committed
Op.getOperand(Idx));
Evan Cheng
committed
return getShuffleVectorZeroOrUndef(V2, Idx, true,
Subtarget->hasSSE2(), DAG);
Evan Cheng
committed
}
Evan Cheng
committed
}
// If element VT is < 32 bits, convert it to inserts into a zero vector.
// NOTE(review): both helper calls below have lost their final argument line,
// and the first `if` has lost its closing brace.
if (EVTBits == 8 && NumElems == 16) {
SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
Gabor Greif
committed
if (V.getNode()) return V;
if (EVTBits == 16 && NumElems == 8) {
SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
Gabor Greif
committed
if (V.getNode()) return V;
}
// If element VT is == 32 bits, turn it into a number of shuffles.
// NOTE(review): the function-scope declaration of `V` is not present in this
// text.
V.resize(NumElems);
if (NumElems == 4 && NumZero > 0) {
for (unsigned i = 0; i < 4; ++i) {
bool isZero = !(NonZeros & (1 << i));
// NOTE(review): as written the second assignment always overwrites the
// first; an `else` appears to have been lost.
if (isZero)
V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
}
// Combine each pair of elements; the switch value is the 2-bit slice of
// NonZeros belonging to pair i.
for (unsigned i = 0; i < 2; ++i) {
switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
default: break;
case 0:
V[i] = V[i*2]; // Must be a zero vector.
break;
case 1:
V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
break;
case 2:
V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
break;
case 3:
V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
break;
}
}
// Build the final mask; a pair is reversed when only its second element is
// set in NonZeros (slice value 2).
SmallVector<int, 8> MaskVec;
bool Reverse = (NonZeros & 0x3) == 2;
for (unsigned i = 0; i < 2; ++i)
MaskVec.push_back(Reverse ? 1-i : i);
Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
for (unsigned i = 0; i < 2; ++i)
MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
// NOTE(review): the closing brace of `if (NumElems == 4 && NumZero > 0)` is
// missing here.
Nate Begeman
committed
if (Values.size() > 1 && VT.getSizeInBits() == 128) {
// Check for a build vector of consecutive loads.
for (unsigned i = 0; i < NumElems; ++i)
V[i] = Op.getOperand(i);
Nate Begeman
committed
// Check for elements which are consecutive loads.
SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
if (LD.getNode())
return LD;
// For SSE 4.1, use insertps to put the high elements into the low element.
Nate Begeman
committed
if (getSubtarget()->hasSSE41()) {
SDValue Result;
if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
else
Result = DAG.getUNDEF(VT);
for (unsigned i = 1; i < NumElems; ++i) {
if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
Op.getOperand(i), DAG.getIntPtrConstant(i));
}
return Result;
}
// Otherwise, expand into a number of unpckl*, start by extending each of
// our (non-undef) elements to the full vector width with the element in the
// bottom slot of the vector (which generates no code for SSE).
for (unsigned i = 0; i < NumElems; ++i) {
if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
else
V[i] = DAG.getUNDEF(VT);
}
// Next, we iteratively mix elements, e.g. for v4f32:
// Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
// : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
// Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
unsigned EltStride = NumElems >> 1;
while (EltStride != 0) {
for (unsigned i = 0; i < EltStride; ++i) {
// If V[i+EltStride] is undef and this is the first round of mixing,
// then it is safe to just drop this shuffle: V[i] is already in the
// right place, the one element (since it's the first round) being
// inserted as undef can be dropped. This isn't safe for successive
// rounds because they will permute elements within both vectors.
if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
EltStride == NumElems/2)
continue;
V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
// NOTE(review): the brace closing the inner `for` appears to be missing
// before the stride update below, and the function's trailing return is
// missing after `return V[0];`.
EltStride >>= 1;
}
return V[0];
}
X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
// We support concatenate two MMX registers and place them in a MMX
// register. This is better than doing a stack convert.
DebugLoc dl = Op.getDebugLoc();
EVT ResVT = Op.getValueType();
assert(Op.getNumOperands() == 2);
assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
int Mask[2];
Wesley Peck
committed
SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
InVec = Op.getOperand(1);
if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
unsigned NumElts = ResVT.getVectorNumElements();
Wesley Peck
committed
VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
} else {
Wesley Peck
committed
InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
Mask[0] = 0; Mask[1] = 2;
VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
}
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
//
// NOTE(review): this text was recovered from a blame-style listing. Author
// names and "committed" markers are interleaved with the code and a number
// of source lines (declarations, guards, closing braces and the function's
// final return) appear to be missing. The most visible gaps are marked
// below; restore the block from upstream before building.
Bruno Cardoso Lopes
committed
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
SmallVector<int, 8> MaskVals;
// Determine if more than 1 of the words in each of the low and high quadwords
// of the result come from the same quadword of one of the two inputs. Undef
// mask values count as coming from any quadword, for better codegen.
SmallVector<unsigned, 4> LoQuad(4);
SmallVector<unsigned, 4> HiQuad(4);
BitVector InputQuads(4);
for (unsigned i = 0; i < 8; ++i) {
SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
int EltIdx = SVOp->getMaskElt(i);
MaskVals.push_back(EltIdx);
if (EltIdx < 0) {
++Quad[0];
++Quad[1];
++Quad[2];
++Quad[3];
continue;
}
++Quad[EltIdx / 4];
InputQuads.set(EltIdx / 4);
}
// Pick, for each result quadword, the input quadword that contributes the
// most words; a quadword is only chosen when its count exceeds 1.
// NOTE(review): the declarations of BestLoQuad and BestHiQuad are not
// present in this text.
unsigned MaxQuad = 1;
for (unsigned i = 0; i < 4; ++i) {
if (LoQuad[i] > MaxQuad) {
BestLoQuad = i;
MaxQuad = LoQuad[i];
}
}
MaxQuad = 1;
for (unsigned i = 0; i < 4; ++i) {
if (HiQuad[i] > MaxQuad) {
BestHiQuad = i;
MaxQuad = HiQuad[i];
}
}
// For SSSE3, If all 8 words of the result come from only 1 quadword of each
// of the two input vectors, shuffle them into one input vector so only a
// single pshufb instruction is necessary. If There are more than 2 input
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
Bruno Cardoso Lopes
committed
if (Subtarget->hasSSSE3()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
BestLoQuad = InputQuads.find_first();
BestHiQuad = InputQuads.find_next(BestLoQuad);
}
if (InputQuads.count() > 2) {
BestLoQuad = -1;
BestHiQuad = -1;
}
}
// If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
// the shuffle mask. If a quad is scored as -1, that means that it contains
// words from all 4 input quadwords.
SDValue NewV;
if (BestLoQuad >= 0 || BestHiQuad >= 0) {
SmallVector<int, 8> MaskV;
MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
Wesley Peck
committed
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
// Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
// source words for the shuffle, to aid later transformations.
bool AllWordsInNewV = true;
Mon P Wang
committed
bool InOrder[2] = { true, true };
for (unsigned i = 0; i != 8; ++i) {
int idx = MaskVals[i];
Mon P Wang
committed
if (idx != (int)i)
InOrder[i/4] = false;
if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
continue;
AllWordsInNewV = false;
break;
}
bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
if (AllWordsInNewV) {
for (int i = 0; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0)
continue;
// Remap the index to its position inside NewV: words from BestLoQuad land in
// 0..3, all others in 4..7.
idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
if ((idx != i) && idx < 4)
pshufhw = false;
if ((idx != i) && idx > 3)
pshuflw = false;
}
V1 = NewV;
V2Used = false;
BestLoQuad = 0;
BestHiQuad = 1;
}
// If we've eliminated the use of V2, and the new mask is a pshuflw or
// pshufhw, that's as cheap as it gets. Return the new shuffle.
Mon P Wang
committed
if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
unsigned TargetMask = 0;
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
Owen Anderson
committed
DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()):
X86::getShufflePSHUFLWImmediate(NewV.getNode());
V1 = NewV.getOperand(0);
Bruno Cardoso Lopes
committed
return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
// NOTE(review): the closing braces of the two enclosing `if` blocks are
// missing here.
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
Bruno Cardoso Lopes
committed
if (Subtarget->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
// shuffle mask element to zero out elements that come from V2 in the V1
// mask, and elements that come from V1 in the V2 mask, so that the two
// results can be OR'd together.
bool TwoInputs = V1Used && V2Used;
for (unsigned i = 0; i != 8; ++i) {
// Each 16-bit word maps to two consecutive byte selectors.
int EltIdx = MaskVals[i] * 2;
if (TwoInputs && (EltIdx >= 16)) {
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
// NOTE(review): closing braces for the `if` above and for the loop (after
// the two push_back calls below) are missing.
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
Wesley Peck
committed
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
Owen Anderson
committed
MVT::v16i8, &pshufbMask[0], 16));
// NOTE(review): as written the next return is unconditional, which would
// make the second-input code below unreachable; a guard appears to have been
// lost - confirm against upstream.
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
if (EltIdx < 16) {
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
Bill Wendling
committed
continue;
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
}
Wesley Peck
committed
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
Owen Anderson
committed
MVT::v16i8, &pshufbMask[0], 16));
V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
Wesley Peck
committed
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
}
// If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
// and update MaskVals with new element order.
// InOrder bit i records that result word i needs no pextrw/pinsrw fix-up in
// the final loop.
BitVector InOrder(8);
if (BestLoQuad >= 0) {
SmallVector<int, 8> MaskV;
for (int i = 0; i != 4; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
MaskV.push_back(-1);
InOrder.set(i);
} else if ((idx / 4) == BestLoQuad) {
MaskV.push_back(idx & 3);
InOrder.set(i);
} else {
MaskV.push_back(-1);
}
// NOTE(review): the brace closing the loop above appears to be missing.
for (unsigned i = 4; i != 8; ++i)
MaskV.push_back(i);
Owen Anderson
committed
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
Bruno Cardoso Lopes
committed
if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFLWImmediate(NewV.getNode()),
DAG);
// NOTE(review): the brace closing `if (BestLoQuad >= 0)` is missing here.
// If BestHi >= 0, generate a pshufhw to put the high elements in order,
// and update MaskVals with the new element order.
if (BestHiQuad >= 0) {
SmallVector<int, 8> MaskV;
for (unsigned i = 0; i != 4; ++i)
MaskV.push_back(i);
for (unsigned i = 4; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
MaskV.push_back(-1);
InOrder.set(i);
} else if ((idx / 4) == BestHiQuad) {
MaskV.push_back((idx & 3) + 4);
InOrder.set(i);
} else {
MaskV.push_back(-1);
// NOTE(review): closing braces for the `else` and the loop are missing here.
Owen Anderson
committed
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
Bruno Cardoso Lopes
committed
if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFHWImmediate(NewV.getNode()),
DAG);
// NOTE(review): the brace closing `if (BestHiQuad >= 0)` is missing here.
// In case BestHi & BestLo were both -1, which means each quadword has a word
// from each of the four input quadwords, calculate the InOrder bitvector now
// before falling through to the insert/extract cleanup.
if (BestLoQuad == -1 && BestHiQuad == -1) {
NewV = V1;
for (int i = 0; i != 8; ++i)
if (MaskVals[i] < 0 || MaskVals[i] == i)
InOrder.set(i);
}
// The other elements are put in the right place using pextrw and pinsrw.
for (unsigned i = 0; i != 8; ++i) {
if (InOrder[i])
continue;
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
// Mask indices 0..7 select from V1, 8..15 from V2.
SDValue ExtOp = (EltIdx < 8)
Owen Anderson
committed
? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
DAG.getIntPtrConstant(EltIdx))
Owen Anderson
committed
: DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
DAG.getIntPtrConstant(EltIdx - 8));
Owen Anderson
committed
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
DAG.getIntPtrConstant(i));
// NOTE(review): the end of this loop, the function's final return and its
// closing brace are missing from this text.
// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG,
const X86TargetLowering &TLI) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
SmallVector<int, 16> MaskVals;
SVOp->getMask(MaskVals);
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
// FIXME: kill V2Only once shuffles are canonizalized by getNode.
bool V1Only = true;
bool V2Only = true;
for (unsigned i = 0; i < 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
if (EltIdx < 16)
V2Only = false;
else
V1Only = false;
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
if (TLI.getSubtarget()->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
// undef mask values to 0x80 (zero out result) in the pshufb mask.
//
// Otherwise, we have elements from both input vectors, and must zero out
// elements that come from V2 in the first mask, and V1 in the second mask
// so that we can OR them together.
bool TwoInputs = !(V1Only || V2Only);
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
}
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
// If all the elements are from V2, assign it to V1 and return after
// building the first pshufb.
if (V2Only)
V1 = V2;
Owen Anderson
committed
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
Owen Anderson
committed
MVT::v16i8, &pshufbMask[0], 16));
if (!TwoInputs)
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 16) {
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
Owen Anderson
committed
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
Owen Anderson
committed
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,