// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all] pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
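//
// For example, a mask like <0,1,2,3, 5,4,7,6> leaves the low quadword in
// place and only permutes the high words, so case 1 needs just a single
// pshufhw; masks mixing words from both inputs usually end up in case 3 or 4.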
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
SmallVector<int, 8> MaskVals;
// Determine if more than 1 of the words in each of the low and high quadwords
// of the result come from the same quadword of one of the two inputs. Undef
// mask values count as coming from any quadword, for better codegen.
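// (Put differently: LoQuad/HiQuad count, for the low and high 64-bit halves
// of the result, how many of their source words come from each of the four
// input quadwords; undefs below are credited to every quadword.)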
SmallVector<unsigned, 4> LoQuad(4);
SmallVector<unsigned, 4> HiQuad(4);
BitVector InputQuads(4);
for (unsigned i = 0; i < 8; ++i) {
SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
int EltIdx = SVOp->getMaskElt(i);
MaskVals.push_back(EltIdx);
if (EltIdx < 0) {
++Quad[0];
++Quad[1];
++Quad[2];
++Quad[3];
continue;
}
++Quad[EltIdx / 4];
InputQuads.set(EltIdx / 4);
}
int BestLoQuad = -1;
unsigned MaxQuad = 1;
for (unsigned i = 0; i < 4; ++i) {
if (LoQuad[i] > MaxQuad) {
BestLoQuad = i;
MaxQuad = LoQuad[i];
}
}

int BestHiQuad = -1;
MaxQuad = 1;
for (unsigned i = 0; i < 4; ++i) {
if (HiQuad[i] > MaxQuad) {
BestHiQuad = i;
MaxQuad = HiQuad[i];
}
}
// For SSSE3, if all 8 words of the result come from only 1 quadword of each
// of the two input vectors, shuffle them into one input vector so only a
// single pshufb instruction is necessary. If there are more than 2 input
// quads, disable the next transformation since it does not help SSSE3.
bool V1Used = InputQuads[0] || InputQuads[1];
bool V2Used = InputQuads[2] || InputQuads[3];
if (Subtarget->hasSSSE3()) {
if (InputQuads.count() == 2 && V1Used && V2Used) {
BestLoQuad = InputQuads.find_first();
BestHiQuad = InputQuads.find_next(BestLoQuad);
}
if (InputQuads.count() > 2) {
BestLoQuad = -1;
BestHiQuad = -1;
}
}
// If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
// the shuffle mask. If a quad is scored as -1, that means that it contains
// words from all 4 input quadwords.
SDValue NewV;
if (BestLoQuad >= 0 || BestHiQuad >= 0) {
SmallVector<int, 8> MaskV;
MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);
// Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
// source words for the shuffle, to aid later transformations.
bool AllWordsInNewV = true;
bool InOrder[2] = { true, true };
for (unsigned i = 0; i != 8; ++i) {
int idx = MaskVals[i];
if (idx != (int)i)
InOrder[i/4] = false;
if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
continue;
AllWordsInNewV = false;
break;
}
bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
if (AllWordsInNewV) {
for (int i = 0; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0)
continue;
idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
if ((idx != i) && idx < 4)
pshufhw = false;
if ((idx != i) && idx > 3)
pshuflw = false;
}
V1 = NewV;
V2Used = false;
BestLoQuad = 0;
BestHiQuad = 1;
}
// If we've eliminated the use of V2, and the new mask is a pshuflw or
// pshufhw, that's as cheap as it gets. Return the new shuffle.
if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
unsigned TargetMask = 0;
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()):
X86::getShufflePSHUFLWImmediate(NewV.getNode());
V1 = NewV.getOperand(0);
return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
}
}

// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
if (Subtarget->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If we have elements from both input vectors, set the high bit of the
// shuffle mask element to zero out elements that come from V2 in the V1
// mask, and elements that come from V1 in the V2 mask, so that the two
// results can be OR'd together.
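// (pshufb zeroes a destination byte whenever bit 7 of the corresponding mask
// byte is set, which is why 0x80 is used below as the "zero this byte" value.)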
bool TwoInputs = V1Used && V2Used;
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
if (TwoInputs && (EltIdx >= 16)) {
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
}
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
}
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
if (!TwoInputs)
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);

// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
for (unsigned i = 0; i != 8; ++i) {
int EltIdx = MaskVals[i] * 2;
if (EltIdx < 16) {
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
}
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
}
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
}
// If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
// and update MaskVals with new element order.
BitVector InOrder(8);
if (BestLoQuad >= 0) {
SmallVector<int, 8> MaskV;
for (int i = 0; i != 4; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
MaskV.push_back(-1);
InOrder.set(i);
} else if ((idx / 4) == BestLoQuad) {
MaskV.push_back(idx & 3);
InOrder.set(i);
} else {
MaskV.push_back(-1);
}
}
for (unsigned i = 4; i != 8; ++i)
MaskV.push_back(i);
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFLWImmediate(NewV.getNode()),
DAG);
}
// If BestHi >= 0, generate a pshufhw to put the high elements in order,
// and update MaskVals with the new element order.
if (BestHiQuad >= 0) {
SmallVector<int, 8> MaskV;
for (unsigned i = 0; i != 4; ++i)
MaskV.push_back(i);
for (unsigned i = 4; i != 8; ++i) {
int idx = MaskVals[i];
if (idx < 0) {
MaskV.push_back(-1);
InOrder.set(i);
} else if ((idx / 4) == BestHiQuad) {
MaskV.push_back((idx & 3) + 4);
InOrder.set(i);
} else {
MaskV.push_back(-1);
}
}
NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
&MaskV[0]);
if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
NewV.getOperand(0),
X86::getShufflePSHUFHWImmediate(NewV.getNode()),
DAG);
}
// In case BestHi & BestLo were both -1, which means each quadword has a word
// from each of the four input quadwords, calculate the InOrder bitvector now
// before falling through to the insert/extract cleanup.
if (BestLoQuad == -1 && BestHiQuad == -1) {
NewV = V1;
for (int i = 0; i != 8; ++i)
if (MaskVals[i] < 0 || MaskVals[i] == i)
InOrder.set(i);
}
// The other elements are put in the right place using pextrw and pinsrw.
for (unsigned i = 0; i != 8; ++i) {
if (InOrder[i])
continue;
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
SDValue ExtOp = (EltIdx < 8)
? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
DAG.getIntPtrConstant(EltIdx))
: DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
DAG.getIntPtrConstant(EltIdx - 8));
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
DAG.getIntPtrConstant(i));
}
return NewV;
}

// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw
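// (So with SSSE3 a v16i8 shuffle costs at most two pshufbs plus a por; without
// it, the fallback below reassembles the result word by word.)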
static
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG,
const X86TargetLowering &TLI) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
SmallVector<int, 16> MaskVals;
SVOp->getMask(MaskVals);
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
// FIXME: kill V2Only once shuffles are canonicalized by getNode.
bool V1Only = true;
bool V2Only = true;
for (unsigned i = 0; i < 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 0)
continue;
if (EltIdx < 16)
V2Only = false;
else
V1Only = false;
}
// If SSSE3, use 1 pshufb instruction per vector with elements in the result.
if (TLI.getSubtarget()->hasSSSE3()) {
SmallVector<SDValue,16> pshufbMask;
// If all result elements are from one input vector, then only translate
// undef mask values to 0x80 (zero out result) in the pshufb mask.
//
// Otherwise, we have elements from both input vectors, and must zero out
// elements that come from V2 in the first mask, and V1 in the second mask
// so that we can OR them together.
bool TwoInputs = !(V1Only || V2Only);
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
}
pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
}
// If all the elements are from V2, assign it to V1 and return after
// building the first pshufb.
if (V2Only)
V1 = V2;
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
if (!TwoInputs)
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
// OR it with the first shuffled input.
pshufbMask.clear();
for (unsigned i = 0; i != 16; ++i) {
int EltIdx = MaskVals[i];
if (EltIdx < 16) {
pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
continue;
}
pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
}
V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
}

// No SSSE3 - Calculate in place words and then fix all out of place words
// with 0-16 extracts & inserts. Worst case is 16 bytes out of order from
// the 16 different words that comprise the two doublequadword input vectors.
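// (Roughly, the loop below builds each result word by extracting the word
// that holds each source byte, shifting or masking it into the right half,
// OR'ing the two halves when both are defined, and inserting the result.)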
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
SDValue NewV = V2Only ? V2 : V1;
for (int i = 0; i != 8; ++i) {
int Elt0 = MaskVals[i*2];
int Elt1 = MaskVals[i*2+1];
// This word of the result is all undef, skip it.
if (Elt0 < 0 && Elt1 < 0)
continue;
// This word of the result is already in the correct place, skip it.
if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
continue;
if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
continue;
SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
SDValue InsElt;
// If Elt0 and Elt1 are defined, are consecutive, and can be loaded
// using a single extract together, load it and store it.
if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
DAG.getIntPtrConstant(Elt1 / 2));
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
DAG.getIntPtrConstant(i));
continue;
}
// If Elt1 is defined, extract it from the appropriate source. If the
// source byte is not also odd, shift the extracted word left 8 bits;
// otherwise clear the bottom 8 bits if we need to do an or.
if (Elt1 >= 0) {
InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
DAG.getIntPtrConstant(Elt1 / 2));
if ((Elt1 & 1) == 0)
InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
DAG.getConstant(8,
TLI.getShiftAmountTy(InsElt.getValueType())));
else if (Elt0 >= 0)
InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
DAG.getConstant(0xFF00, MVT::i16));
}
// If Elt0 is defined, extract it from the appropriate source. If the
// source byte is not also even, shift the extracted word right 8 bits. If
// Elt1 was also defined, OR the extracted values together before
// inserting them in the result.
if (Elt0 >= 0) {
SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
if ((Elt0 & 1) != 0)
InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
DAG.getConstant(8,
TLI.getShiftAmountTy(InsElt0.getValueType())));
else if (Elt1 >= 0)
InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
DAG.getConstant(0x00FF, MVT::i16));
InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
: InsElt0;
}
NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
DAG.getIntPtrConstant(i));
}
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements points to elements in
/// the right sequence. e.g.
/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
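/// (Illustration, not part of the original comment: with Scale == 2 the mask
/// above becomes the v4i32 mask <1, 5, 0, 7>.)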
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG, DebugLoc dl) {
EVT VT = SVOp->getValueType(0);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
unsigned NumElems = VT.getVectorNumElements();
unsigned NewWidth = (NumElems == 4) ? 2 : 4;
EVT NewVT;
switch (VT.getSimpleVT().SimpleTy) {
default: assert(false && "Unexpected!");
case MVT::v4f32: NewVT = MVT::v2f64; break;
case MVT::v4i32: NewVT = MVT::v2i64; break;
case MVT::v8i16: NewVT = MVT::v4i32; break;
case MVT::v16i8: NewVT = MVT::v4i32; break;
}
int Scale = NumElems / NewWidth;
SmallVector<int, 8> MaskVec;
for (unsigned i = 0; i < NumElems; i += Scale) {
int StartIdx = -1;
for (int j = 0; j < Scale; ++j) {
int EltIdx = SVOp->getMaskElt(i+j);
if (EltIdx < 0)
continue;
if (StartIdx == -1)
StartIdx = EltIdx - (EltIdx % Scale);
if (EltIdx != StartIdx + j)
return SDValue();
}
if (StartIdx == -1)
MaskVec.push_back(-1);
else
MaskVec.push_back(StartIdx / Scale);
}
V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}

/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(EVT VT, EVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget, DebugLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
LoadSDNode *LD = NULL;
if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
LD = dyn_cast<LoadSDNode>(SrcOp);
if (!LD) {
// movssrr and movsdrr do not clear top bits. Try to use movd, movq
// instead.
EVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
// PR2108
OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
OpVT,
SrcOp.getOperand(0).getOperand(0))));
}
}
}
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
DAG.getNode(ISD::BITCAST, dl,
OpVT, SrcOp)));
}
/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
/// which could not be matched by any known target specific shuffle
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
return SDValue();
}
/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
/// 4 elements, and match them with several different shuffle types.
static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
EVT VT = SVOp->getValueType(0);
assert(VT.getSizeInBits() == 128 && "Unsupported vector size");
SmallVector<std::pair<int, int>, 8> Locs;
Locs.resize(4);
SmallVector<int, 8> Mask1(4U, -1);
SmallVector<int, 8> PermMask;
SVOp->getMask(PermMask);
unsigned NumHi = 0;
unsigned NumLo = 0;
for (unsigned i = 0; i != 4; ++i) {
int Idx = PermMask[i];
if (Idx < 0) {
Locs[i] = std::make_pair(-1, -1);
} else {
assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
if (Idx < 4) {
Locs[i] = std::make_pair(0, NumLo);
Mask1[NumLo] = Idx;
NumLo++;
} else {
Locs[i] = std::make_pair(1, NumHi);
if (2+NumHi < 4)
Mask1[2+NumHi] = Idx;
NumHi++;
}
}
}
if (NumLo <= 2 && NumHi <= 2) {
// If no more than two elements come from either vector, this can be
// implemented with two shuffles. The first shuffle gathers the elements.
// The second shuffle, which takes the first shuffle as both of its
// vector operands, puts the elements into the right order.
V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
SmallVector<int, 8> Mask2(4U, -1);
for (unsigned i = 0; i != 4; ++i) {
if (Locs[i].first == -1)
continue;
else {
unsigned Idx = (i < 2) ? 0 : 4;
Idx += Locs[i].first * 2 + Locs[i].second;
Mask2[i] = Idx;
}
}
return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
} else if (NumLo == 3 || NumHi == 3) {
// Otherwise, we must have three elements from one vector, call it X, and
// one element from the other, call it Y. First, use a shufps to build an
// intermediate vector with the one element from Y and the element from X
// that will be in the same half in the final destination (the indexes don't
// matter). Then, use a shufps to build the final vector, taking the half
// containing the element from Y from the intermediate, and the other half
// from X.
if (NumHi == 3) {
// Normalize it so the 3 elements come from V1.
CommuteVectorShuffleMask(PermMask, VT);
std::swap(V1, V2);
}
// Find the element from V2.
unsigned HiIndex;
for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
int Val = PermMask[HiIndex];
if (Val < 0)
continue;
if (Val >= 4)
break;
}
Mask1[0] = PermMask[HiIndex];
Mask1[1] = -1;
Mask1[2] = PermMask[HiIndex^1];
Mask1[3] = -1;
V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
if (HiIndex >= 2) {
Mask1[0] = PermMask[0];
Mask1[1] = PermMask[1];
Mask1[2] = HiIndex & 1 ? 6 : 4;
Mask1[3] = HiIndex & 1 ? 4 : 6;
return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
} else {
Mask1[0] = HiIndex & 1 ? 2 : 0;
Mask1[1] = HiIndex & 1 ? 0 : 2;
Mask1[2] = PermMask[2];
Mask1[3] = PermMask[3];
if (Mask1[2] >= 0)
Mask1[2] += 4;
if (Mask1[3] >= 0)
Mask1[3] += 4;
return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
}
}
// Break it into (shuffle shuffle_hi, shuffle_lo).
Locs.clear();
Locs.resize(4);
SmallVector<int,8> LoMask(4U, -1);
SmallVector<int,8> HiMask(4U, -1);
SmallVector<int,8> *MaskPtr = &LoMask;
unsigned MaskIdx = 0;
unsigned LoIdx = 0;
unsigned HiIdx = 2;
for (unsigned i = 0; i != 4; ++i) {
if (i == 2) {
MaskPtr = &HiMask;
MaskIdx = 1;
LoIdx = 0;
HiIdx = 2;
}
int Idx = PermMask[i];
if (Idx < 0) {
Locs[i] = std::make_pair(-1, -1);
} else if (Idx < 4) {
Locs[i] = std::make_pair(MaskIdx, LoIdx);
(*MaskPtr)[LoIdx] = Idx;
LoIdx++;
} else {
Locs[i] = std::make_pair(MaskIdx, HiIdx);
(*MaskPtr)[HiIdx] = Idx;
HiIdx++;
}
}
SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
SmallVector<int, 8> MaskOps;
for (unsigned i = 0; i != 4; ++i) {
if (Locs[i].first == -1) {
MaskOps.push_back(-1);
} else {
unsigned Idx = Locs[i].first * 4 + Locs[i].second;
MaskOps.push_back(Idx);
}
}
return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}
static bool MayFoldVectorLoad(SDValue V) {
if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
V = V.getOperand(0);
if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
V = V.getOperand(0);
if (MayFoldLoad(V))
return true;
return false;
}
// FIXME: the version above should always be used. Since there's
// a bug where several vector shuffles can't be folded because the
// DAG is not updated during lowering and a node claims to have two
// uses while it only has one, use this version, and let isel match
// another instruction if the load really happens to have more than
// one use. Remove this version after this bug gets fixed.
// rdar://8434668, PR8156
static bool RelaxedMayFoldVectorLoad(SDValue V) {
if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
V = V.getOperand(0);
if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
V = V.getOperand(0);
if (ISD::isNormalLoad(V.getNode()))
return true;
return false;
}
/// CanXFormVExtractWithShuffleIntoLoad - Check if the current shuffle is used by
/// a vector extract, and if both can be later optimized into a single load.
/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked
/// here because otherwise a target specific shuffle node is going to be
/// emitted for this shuffle, and the optimization not done.
/// FIXME: This is probably not the best approach, but fix the problem
/// until the right path is decided.
static
bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
const TargetLowering &TLI) {
EVT VT = V.getValueType();
ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V);
// Be sure that the vector shuffle is present in a pattern like this:
// (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr)
if (!V.hasOneUse())
return false;
SDNode *N = *V.getNode()->use_begin();
if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
return false;
SDValue EltNo = N->getOperand(1);
if (!isa<ConstantSDNode>(EltNo))
return false;
// If the bit convert changed the number of elements, it is unsafe
// to examine the mask.
bool HasShuffleIntoBitcast = false;
if (V.getOpcode() == ISD::BITCAST) {
EVT SrcVT = V.getOperand(0).getValueType();
if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
return false;
V = V.getOperand(0);
HasShuffleIntoBitcast = true;
}
// Select the input vector, guarding against out of range extract vector.
unsigned NumElems = VT.getVectorNumElements();
unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);
// Skip one more bit_convert if necessary
if (V.getOpcode() == ISD::BITCAST)
V = V.getOperand(0);
if (ISD::isNormalLoad(V.getNode())) {
// Is the original load suitable?
LoadSDNode *LN0 = cast<LoadSDNode>(V);
// FIXME: avoid the multi-use bug that is preventing lots of
// foldings from being detected. This is still wrong of course, but
// it gives the temporary desired behavior; if it happens that
// the load really has more uses, during isel it will not fold, and
// will generate poor code.
if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
return false;
if (!HasShuffleIntoBitcast)
return true;
// If there's a bitcast before the shuffle, check if the load type and
// alignment is valid.
unsigned Align = LN0->getAlignment();
unsigned NewAlign =
TLI.getTargetData()->getABITypeAlignment(
VT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
return false;
}
return true;
}
static
SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Canonicalize to v2f64.
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
return DAG.getNode(ISD::BITCAST, dl, VT,
getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
V1, DAG));
}
static
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
assert(VT != MVT::v2i64 && "unsupported shuffle type");
if (HasSSE2 && VT == MVT::v2f64)
return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);
// v4f32 or v4i32
return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
}
static
SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
"unsupported shuffle type");
if (V2.getOpcode() == ISD::UNDEF)
V2 = V1;
// v4i32 or v4f32
return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}
static
SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
unsigned NumElems = VT.getVectorNumElements();
// Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
// operand of these instructions is only memory, so check if there's a
// potential load folding here, otherwise use SHUFPS or MOVSD to match the
// same masks.
bool CanFoldLoad = false;
// Trivial case, when V2 comes from a load.
if (MayFoldVectorLoad(V2))
CanFoldLoad = true;
// When V1 is a load, it can be folded later into a store in isel, example:
// (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
// turns into:
// (MOVLPSmr addr:$src1, VR128:$src2)
// So, recognize this potential and also use MOVLPS or MOVLPD
if (MayFoldVectorLoad(V1) && MayFoldIntoStore(Op))
CanFoldLoad = true;
// Both of them can't be memory operations though.
if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2))
CanFoldLoad = false;
if (CanFoldLoad) {
if (HasSSE2 && NumElems == 2)
return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);
if (NumElems == 4)
return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
}
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
// movl and movlp will both match v2i64, but v2i64 is never matched by
// movl earlier because we make it strict to avoid messing with the movlp load
// folding logic (see the code above getMOVLP call). Match it here then,
// this is horrible, but will stay like this until we move all shuffle
// matching to x86 specific nodes. Note that for the 1st condition all
// types are matched with movsd.
if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
else if (HasSSE2)
return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
assert(VT != MVT::v4i32 && "unsupported shuffle type");
// Invert the operand order and use SHUFPS to match it.
return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
X86::getShuffleSHUFImmediate(SVOp), DAG);
}
static inline unsigned getUNPCKLOpcode(EVT VT) {
switch(VT.getSimpleVT().SimpleTy) {
case MVT::v4i32: return X86ISD::PUNPCKLDQ;
case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
case MVT::v4f32: return X86ISD::UNPCKLPS;
case MVT::v2f64: return X86ISD::UNPCKLPD;
case MVT::v8f32: return X86ISD::VUNPCKLPSY;
case MVT::v4f64: return X86ISD::VUNPCKLPDY;
case MVT::v16i8: return X86ISD::PUNPCKLBW;
case MVT::v8i16: return X86ISD::PUNPCKLWD;
default:
llvm_unreachable("Unsupported vector type for unpckl");
}
return 0;
}
static inline unsigned getUNPCKHOpcode(EVT VT) {
switch(VT.getSimpleVT().SimpleTy) {
case MVT::v4i32: return X86ISD::PUNPCKHDQ;
case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
case MVT::v4f32: return X86ISD::UNPCKHPS;
case MVT::v2f64: return X86ISD::UNPCKHPD;
case MVT::v8f32: return X86ISD::VUNPCKHPSY;
case MVT::v4f64: return X86ISD::VUNPCKHPDY;
case MVT::v16i8: return X86ISD::PUNPCKHBW;
case MVT::v8i16: return X86ISD::PUNPCKHWD;
default:
llvm_unreachable("Unsupported vector type for unpckh");
}
return 0;
}
static inline unsigned getVPERMILOpcode(EVT VT) {
switch(VT.getSimpleVT().SimpleTy) {
case MVT::v4i32:
case MVT::v4f32: return X86ISD::VPERMILPS;
case MVT::v2i64:
case MVT::v2f64: return X86ISD::VPERMILPD;
case MVT::v8i32:
case MVT::v8f32: return X86ISD::VPERMILPSY;
case MVT::v4i64:
case MVT::v4f64: return X86ISD::VPERMILPDY;
default:
llvm_unreachable("Unknown type for vpermil");
}
return 0;
}
static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
const TargetLowering &TLI,
const X86Subtarget *Subtarget) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
if (isZeroShuffle(SVOp))
return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
// Handle splat operations
if (SVOp->isSplat()) {
unsigned NumElem = VT.getVectorNumElements();
// Special case, this is the only place now where it's allowed to return
// a vector_shuffle operation without using a target specific node, because
// *hopefully* it will be optimized away by the dag combiner. FIXME: should
// this be moved to DAGCombine instead?
if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
return Op;
// Since there's no native support for scalar_to_vector for 256-bit AVX, a
// 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
// idiom and do the shuffle before the insertion; this yields fewer
// instructions in the end.
if (VT.is256BitVector() &&
V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
V1.getOperand(0).getOpcode() == ISD::UNDEF &&
V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
return PromoteVectorToScalarSplat(SVOp, DAG);
// Handle splats by matching through known shuffle masks
if ((VT.is128BitVector() && NumElem <= 4) ||
(VT.is256BitVector() && NumElem <= 8))
return SDValue();
// All i16 and i8 vector types can't be used directly by a generic shuffle
// instruction because the target has no such instruction. Generate shuffles
// which repeat i16 and i8 several times until they fit in i32, and then can
// be manipulated by target supported shuffles. After the insertion of the
// necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
return PromoteSplat(SVOp, DAG);
}

// If the shuffle can be profitably rewritten as a narrower shuffle, then
// do it!
if (VT == MVT::v8i16 || VT == MVT::v16i8) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
} else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
// FIXME: Figure out a cleaner way to do this.
// Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode()) {
if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
DAG, Subtarget, dl);
}
} else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
DAG, Subtarget, dl);
}
}
return SDValue();
}
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
unsigned NumElems = VT.getVectorNumElements();
bool isMMX = VT.getSizeInBits() == 64;
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
bool V1IsSplat = false;
bool V2IsSplat = false;
bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
MachineFunction &MF = DAG.getMachineFunction();
bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
// Shuffle operations on MMX not supported.
if (isMMX)
return Op;
// Vector shuffle lowering takes 3 steps:
//
// 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
// narrowing and commutation of operands should be handled.
// 2) Matching of shuffles with known shuffle masks to x86 target specific