// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all]   pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
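// For example, a mask whose words all come from one input and stay within
// their own 64-bit half can be lowered as case 1, while a mask that mixes
// words from both inputs needs case 3 (two pshufbs OR'd together) or, without
// SSSE3, case 4.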
SDValue
X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
                                            SelectionDAG &DAG) const {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high quadwords
  // of the result come from the same quadword of one of the two inputs.  Undef
  // mask values count as coming from any quadword, for better codegen.
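  // Quadwords are numbered 0-3: 0 and 1 are the low and high halves of V1,
  // 2 and 3 are the low and high halves of V2, so mask element EltIdx lives
  // in quadword EltIdx/4.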
  SmallVector<unsigned, 4> LoQuad(4);
  SmallVector<unsigned, 4> HiQuad(4);
  BitVector InputQuads(4);
  for (unsigned i = 0; i < 8; ++i) {
    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, if all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a
  // single pshufb instruction is necessary. If there are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];

  if (Subtarget->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads.find_first();
      BestHiQuad = InputQuads.find_next(BestLoQuad);
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }
  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
  // the shuffle mask.  If a quad is scored as -1, that means that it contains
  // words from all 4 input quadwords.
  SDValue NewV;
  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
    SmallVector<int, 8> MaskV;
    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1),
                  DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]);
    NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV);

    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
    // source words for the shuffle, to aid later transformations.
    bool AllWordsInNewV = true;
    bool InOrder[2] = { true, true };
    for (unsigned i = 0; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx != (int)i)
        InOrder[i/4] = false;
      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
        continue;
      AllWordsInNewV = false;
      break;
    }

    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
    if (AllWordsInNewV) {
      for (int i = 0; i != 8; ++i) {
        int idx = MaskVals[i];
        if (idx < 0)
          continue;
        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
        if ((idx != i) && idx < 4)
          pshufhw = false;
        if ((idx != i) && idx > 3)
          pshuflw = false;
      }
      V1 = NewV;
      V2Used = false;
      BestLoQuad = 0;
      BestHiQuad = 1;
    }

    // If we've eliminated the use of V2, and the new mask is a pshuflw or
    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
      unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW;
      unsigned TargetMask = 0;
      NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
      TargetMask = pshufhw ? X86::getShufflePSHUFHWImmediate(NewV.getNode()):
                             X86::getShufflePSHUFLWImmediate(NewV.getNode());
      V1 = NewV.getOperand(0);
      return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG);
    }
  }

  // If we have SSSE3, and all words of the result are from 1 input vector,
  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
  // is present, fall back to case 4.
  if (Subtarget->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;

    // If we have elements from both input vectors, set the high bit of the
    // shuffle mask element to zero out elements that come from V2 in the V1
    // mask, and elements that come from V1 in the V2 mask, so that the two
    // results can be OR'd together.
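    // For example, with the word mask <0,8,1,9,2,10,3,11> V1's byte mask is
    // <0,1,80,80,2,3,80,80,4,5,80,80,6,7,80,80> and V2's byte mask is
    // <80,80,0,1,80,80,2,3,80,80,4,5,80,80,6,7> (0x80 selects a zero byte),
    // so OR'ing the two pshufb results produces the interleaved vector.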
    bool TwoInputs = V1Used && V2Used;
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (TwoInputs && (EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
    }
    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);

    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
    }
    V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V2);
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  }

  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
  // and update MaskVals with new element order.
  BitVector InOrder(8);
  if (BestLoQuad >= 0) {
    SmallVector<int, 8> MaskV;
    for (int i = 0; i != 4; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(-1);
        InOrder.set(i);
      } else if ((idx / 4) == BestLoQuad) {
        MaskV.push_back(idx & 3);
        InOrder.set(i);
      } else {
        MaskV.push_back(-1);
      }
    }
    for (unsigned i = 4; i != 8; ++i)
      MaskV.push_back(i);
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);

    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
      NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                               NewV.getOperand(0),
                               X86::getShufflePSHUFLWImmediate(NewV.getNode()),
                               DAG);
  }

  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
  // and update MaskVals with the new element order.
  if (BestHiQuad >= 0) {
    SmallVector<int, 8> MaskV;
    for (unsigned i = 0; i != 4; ++i)
      MaskV.push_back(i);
    for (unsigned i = 4; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(-1);
        InOrder.set(i);
      } else if ((idx / 4) == BestHiQuad) {
        MaskV.push_back((idx & 3) + 4);
        InOrder.set(i);
      } else {
        MaskV.push_back(-1);
      }
    }
    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                &MaskV[0]);

    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3())
      NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                              NewV.getOperand(0),
                              X86::getShufflePSHUFHWImmediate(NewV.getNode()),
                              DAG);
  }

  // In case BestHi & BestLo were both -1, which means each quadword has a word
  // from each of the four input quadwords, calculate the InOrder bitvector now
  // before falling through to the insert/extract cleanup.
  if (BestLoQuad == -1 && BestHiQuad == -1) {
    NewV = V1;
    for (int i = 0; i != 8; ++i)
      if (MaskVals[i] < 0 || MaskVals[i] == i)
        InOrder.set(i);
  }
  // The other elements are put in the right place using pextrw and pinsrw.
  for (unsigned i = 0; i != 8; ++i) {
    if (InOrder[i])
      continue;
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    SDValue ExtOp = (EltIdx < 8)
    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                  DAG.getIntPtrConstant(EltIdx))
    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                  DAG.getIntPtrConstant(EltIdx - 8));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getIntPtrConstant(i));
  }
  return NewV;
}

// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 const X86TargetLowering &TLI) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  SmallVector<int, 16> MaskVals;
  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
  // present, fall back to case 3.
  // FIXME: kill V2Only once shuffles are canonicalized by getNode.
  bool V1Only = true;
  bool V2Only = true;
  for (unsigned i = 0; i < 16; ++i) {
    int EltIdx = SVOp->getMaskElt(i);
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0)
      continue;
    if (EltIdx < 16)
      V2Only = false;
    else
      V1Only = false;
  }

  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;
    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask.
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
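    // For example, with the byte mask <0,16,1,17,...> V1's pshufb mask begins
    // <0,80,1,80,...> and V2's begins <80,0,80,1,...>; OR'ing the two results
    // interleaves the low bytes of both inputs.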
    bool TwoInputs = !(V1Only || V2Only);
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    // If all the elements are from V2, assign it to V1 and return after
    // building the first pshufb.
    if (V2Only)
      V1 = V2;
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return V1;
    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - Calculate in place words and then fix all out of place words
  // with 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2);
  SDValue NewV = V2Only ? V2 : V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];
    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;
    // This word of the result is already in the correct place, skip it.
    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
      continue;
    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
      continue;
    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and can be loaded
    // together using a single extract, load it and store it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                         DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source.  If the
    // source byte is not also odd, shift the extracted word left 8 bits;
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8,
                                  TLI.getShiftAmountTy(InsElt.getValueType())));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
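    // For example, if Elt1 is source byte 4 (the even byte of source word 2),
    // word 2 is extracted and shifted left 8 bits so the byte lands in the
    // high half of the destination word.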
    // If Elt0 is defined, extract it from the appropriate source.  If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8,
                                 TLI.getShiftAmountTy(InsElt0.getValueType())));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                             DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
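/// becomes the v4i32 shuffle vector_shuffle X, Y, <1, 5, 0, 7>.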
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG, DebugLoc dl) {
  EVT VT = SVOp->getValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  unsigned NumElems = VT.getVectorNumElements();
  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
  EVT NewVT;
  switch (VT.getSimpleVT().SimpleTy) {
  default: assert(false && "Unexpected!");
  case MVT::v4f32: NewVT = MVT::v2f64; break;
  case MVT::v4i32: NewVT = MVT::v2i64; break;
  case MVT::v8i16: NewVT = MVT::v4i32; break;
  case MVT::v16i8: NewVT = MVT::v4i32; break;
  }

  int Scale = NumElems / NewWidth;
  SmallVector<int, 8> MaskVec;
  for (unsigned i = 0; i < NumElems; i += Scale) {
    int StartIdx = -1;
    for (int j = 0; j < Scale; ++j) {
      int EltIdx = SVOp->getMaskElt(i+j);
      if (EltIdx < 0)
        continue;
      if (StartIdx == -1)
        StartIdx = EltIdx - (EltIdx % Scale);
      if (EltIdx != StartIdx + j)
        return SDValue();
    }
    if (StartIdx == -1)
      MaskVec.push_back(-1);
    else
      MaskVec.push_back(StartIdx / Scale);
  }

  V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1);
  V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2);
  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}

/// getVZextMovL - Return a zero-extending vector move low node.
static SDValue getVZextMovL(EVT VT, EVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, DebugLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = NULL;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BITCAST, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                          .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BITCAST, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vector shuffles which
/// could not be matched by any known target-specific shuffle.
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  return SDValue();
}

/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
/// 4 elements, and match them with several different shuffle types.
static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  EVT VT = SVOp->getValueType(0);

  assert(VT.getSizeInBits() == 128 && "Unsupported vector size");

  SmallVector<std::pair<int, int>, 8> Locs;
  Locs.resize(4);
  SmallVector<int, 8> Mask1(4U, -1);
  SmallVector<int, 8> PermMask;
  SVOp->getMask(PermMask);

  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector, this can be
    // implemented with two shuffles. The first shuffle gathers the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, puts the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    SmallVector<int, 8> Mask2(4U, -1);
    for (unsigned i = 0; i != 4; ++i) {
      if (Locs[i].first == -1)
        continue;
      else {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }
    }
    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  } else if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y.  First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes don't
    // matter). Then, use a shufps to build the final vector, taking the half
    // containing the element from Y from the intermediate, and the other half
    // from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, VT);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    } else {
      Mask1[0] = HiIndex & 1 ? 2 : 0;
      Mask1[1] = HiIndex & 1 ? 0 : 2;
      Mask1[2] = PermMask[2];
      Mask1[3] = PermMask[3];
      if (Mask1[2] >= 0)
        Mask1[2] += 4;
      if (Mask1[3] >= 0)
        Mask1[3] += 4;
      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
    }
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  Locs.clear();
  Locs.resize(4);
  SmallVector<int,8> LoMask(4U, -1);
  SmallVector<int,8> HiMask(4U, -1);

  SmallVector<int,8> *MaskPtr = &LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = &HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      (*MaskPtr)[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      (*MaskPtr)[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  SmallVector<int, 8> MaskOps;
  for (unsigned i = 0; i != 4; ++i) {
    if (Locs[i].first == -1) {
      MaskOps.push_back(-1);
    } else {
      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
      MaskOps.push_back(Idx);
    }
  }
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}

static bool MayFoldVectorLoad(SDValue V) {
  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    V = V.getOperand(0);
  if (MayFoldLoad(V))
    return true;
  return false;
}

// FIXME: the version above should always be used. Since there's
// a bug where several vector shuffles can't be folded because the
// DAG is not updated during lowering and a node claims to have two
// uses while it only has one, use this version, and let isel match
// another instruction if the load really happens to have more than
// one use. Remove this version after this bug gets fixed.
static bool RelaxedMayFoldVectorLoad(SDValue V) {
  if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);
  if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    V = V.getOperand(0);
  if (ISD::isNormalLoad(V.getNode()))
    return true;
  return false;
}

/// CanFoldShuffleIntoVExtract - Check if the current shuffle is used by
/// a vector extract, and if both can be later optimized into a single load.
/// This is done in visitEXTRACT_VECTOR_ELT and the conditions are checked
/// here because otherwise a target specific shuffle node is going to be
/// emitted for this shuffle, and the optimization would not be done.
/// FIXME: This is probably not the best approach, but fixes the problem
/// until the right path is decided.
static
bool CanXFormVExtractWithShuffleIntoLoad(SDValue V, SelectionDAG &DAG,
                                         const TargetLowering &TLI) {
  EVT VT = V.getValueType();
  ShuffleVectorSDNode *SVOp = dyn_cast<ShuffleVectorSDNode>(V);

  // Be sure that the vector shuffle is present in a pattern like this:
  // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), c) -> (f32 load $addr)
  if (!V.hasOneUse())
    return false;

  SDNode *N = *V.getNode()->use_begin();
  if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    return false;

  SDValue EltNo = N->getOperand(1);
  if (!isa<ConstantSDNode>(EltNo))
    return false;

  // If the bit convert changed the number of elements, it is unsafe
  // to examine the mask.
  bool HasShuffleIntoBitcast = false;
  if (V.getOpcode() == ISD::BITCAST) {
    EVT SrcVT = V.getOperand(0).getValueType();
    if (SrcVT.getVectorNumElements() != VT.getVectorNumElements())
      return false;
    V = V.getOperand(0);
    HasShuffleIntoBitcast = true;
  }

  // Select the input vector, guarding against out of range extract vector.
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt > NumElems) ? -1 : SVOp->getMaskElt(Elt);
  V = (Idx < (int)NumElems) ? V.getOperand(0) : V.getOperand(1);

  // Skip one more bit_convert if necessary
  if (V.getOpcode() == ISD::BITCAST)
    V = V.getOperand(0);

  if (ISD::isNormalLoad(V.getNode())) {
    // Is the original load suitable?
    LoadSDNode *LN0 = cast<LoadSDNode>(V);

    // FIXME: avoid the multi-use bug that is preventing lots of
    // foldings from being detected; this is still wrong of course, but
    // gives the temporary desired behavior, and if it happens that
    // the load really has more uses, during isel it will not fold, and
    // will generate poor code.
    if (!LN0 || LN0->isVolatile()) // || !LN0->hasOneUse()
      return false;

    if (!HasShuffleIntoBitcast)
      return true;

    // If there's a bitcast before the shuffle, check if the load type and
    // alignment is valid.
    unsigned Align = LN0->getAlignment();
    unsigned NewAlign =
      TLI.getTargetData()->getABITypeAlignment(
                                    VT.getTypeForEVT(*DAG.getContext()));

    if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
      return false;
  }

  return true;
}

static
SDValue getMOVDDup(SDValue &Op, DebugLoc &dl, SDValue V1, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();

  // Canonicalize to v2f64.
  V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, V1);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     getTargetShuffleNode(X86ISD::MOVDDUP, dl, MVT::v2f64,
                                          V1, DAG));
}

static
SDValue getMOVLowToHigh(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG,
                        bool HasSSE2) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();

  assert(VT != MVT::v2i64 && "unsupported shuffle type");

  if (HasSSE2 && VT == MVT::v2f64)
    return getTargetShuffleNode(X86ISD::MOVLHPD, dl, VT, V1, V2, DAG);

  // v4f32 or v4i32
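  // MOVLHPS places the low 64 bits of V1 and the low 64 bits of V2 in the low
  // and high halves of the result, i.e. <V1[0], V1[1], V2[0], V2[1]> for v4f32.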
  return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V2, DAG);
}

static
SDValue getMOVHighToLow(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();

  assert((VT == MVT::v4i32 || VT == MVT::v4f32) &&
         "unsupported shuffle type");

  if (V2.getOpcode() == ISD::UNDEF)
    V2 = V1;

  // v4i32 or v4f32
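  // MOVHLPS moves the high 64 bits of V2 into the low half of the result and
  // keeps the high 64 bits of V1, i.e. <V2[2], V2[3], V1[2], V1[3]> for v4f32.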
  return getTargetShuffleNode(X86ISD::MOVHLPS, dl, VT, V1, V2, DAG);
}

static
SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  unsigned NumElems = VT.getVectorNumElements();

  // Use MOVLPS and MOVLPD in case V1 or V2 are loads. During isel, the second
  // operand of these instructions is only memory, so check if there's a
  // potential load folding here, otherwise use SHUFPS or MOVSD to match the
  // same masks.
  bool CanFoldLoad = false;

  // Trivial case, when V2 comes from a load.
  if (MayFoldVectorLoad(V2))
    CanFoldLoad = true;

  // When V1 is a load, it can be folded later into a store in isel, example:
  //  (store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1)
  //    turns into:
  //  (MOVLPSmr addr:$src1, VR128:$src2)
  // So, recognize this potential and also use MOVLPS or MOVLPD
  // Both of them can't be memory operations though.
  if (MayFoldVectorLoad(V1) && MayFoldVectorLoad(V2))
    CanFoldLoad = false;
  if (CanFoldLoad) {
    if (HasSSE2 && NumElems == 2)
      return getTargetShuffleNode(X86ISD::MOVLPD, dl, VT, V1, V2, DAG);

    if (NumElems == 4)
      return getTargetShuffleNode(X86ISD::MOVLPS, dl, VT, V1, V2, DAG);
  }

  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  // movl and movlp will both match v2i64, but v2i64 is never matched by
  // movl earlier because we make it strict to avoid messing with the movlp load
  // folding logic (see the code above getMOVLP call). Match it here then,
  // this is horrible, but will stay like this until we move all shuffle
  // matching to x86 specific nodes. Note that for the 1st condition all
  // types are matched with movsd.
  if ((HasSSE2 && NumElems == 2) || !X86::isMOVLMask(SVOp))
    return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);
  else if (HasSSE2)
    return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);


  assert(VT != MVT::v4i32 && "unsupported shuffle type");

  // Invert the operand order and use SHUFPS to match it.
  return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V2, V1,
                              X86::getShuffleSHUFImmediate(SVOp), DAG);
}

static inline unsigned getUNPCKLOpcode(EVT VT) {
  switch(VT.getSimpleVT().SimpleTy) {
  case MVT::v4i32: return X86ISD::PUNPCKLDQ;
  case MVT::v2i64: return X86ISD::PUNPCKLQDQ;
  case MVT::v4f32: return X86ISD::UNPCKLPS;
  case MVT::v2f64: return X86ISD::UNPCKLPD;
  case MVT::v8f32: return X86ISD::VUNPCKLPSY;
  case MVT::v4f64: return X86ISD::VUNPCKLPDY;
  case MVT::v16i8: return X86ISD::PUNPCKLBW;
  case MVT::v8i16: return X86ISD::PUNPCKLWD;
  default:
    llvm_unreachable("Unknown type for unpckl");
  }
  return 0;
}

static inline unsigned getUNPCKHOpcode(EVT VT) {
  switch(VT.getSimpleVT().SimpleTy) {
  case MVT::v4i32: return X86ISD::PUNPCKHDQ;
  case MVT::v2i64: return X86ISD::PUNPCKHQDQ;
  case MVT::v4f32: return X86ISD::UNPCKHPS;
  case MVT::v2f64: return X86ISD::UNPCKHPD;
  case MVT::v8f32: return X86ISD::VUNPCKHPSY;
  case MVT::v4f64: return X86ISD::VUNPCKHPDY;
  case MVT::v16i8: return X86ISD::PUNPCKHBW;
  case MVT::v8i16: return X86ISD::PUNPCKHWD;
  default:
    llvm_unreachable("Unknown type for unpckh");
  }
  return 0;
}

static inline unsigned getVPERMILOpcode(EVT VT) {
  switch(VT.getSimpleVT().SimpleTy) {
  case MVT::v4i32:
  case MVT::v4f32: return X86ISD::VPERMILPS;
  case MVT::v2i64:
  case MVT::v2f64: return X86ISD::VPERMILPD;
  case MVT::v8i32:
  case MVT::v8f32: return X86ISD::VPERMILPSY;
  case MVT::v4i64:
  case MVT::v4f64: return X86ISD::VPERMILPDY;
  default:
    llvm_unreachable("Unknown type for vpermil");
  }
  return 0;
}

static
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
                               const TargetLowering &TLI,
                               const X86Subtarget *Subtarget) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);

  if (isZeroShuffle(SVOp))
    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);

  // Handle splat operations
  if (SVOp->isSplat()) {
    unsigned NumElem = VT.getVectorNumElements();
    // Special case, this is the only place now where it's allowed to return
    // a vector_shuffle operation without using a target specific node, because
    // *hopefully* it will be optimized away by the dag combiner. FIXME: should
    // this be moved to DAGCombine instead?
    if (NumElem <= 4 && CanXFormVExtractWithShuffleIntoLoad(Op, DAG, TLI))
      return Op;

    // Since there's no native support for scalar_to_vector for 256-bit AVX, a
    // 128-bit scalar_to_vector + INSERT_SUBVECTOR is generated. Recognize this
    // idiom and do the shuffle before the insertion, this yields less
    // instructions in the end.
    if (VT.is256BitVector() &&
        V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
        V1.getOperand(0).getOpcode() == ISD::UNDEF &&
        V1.getOperand(1).getOpcode() == ISD::SCALAR_TO_VECTOR)
      return PromoteVectorToScalarSplat(SVOp, DAG);

    // Handle splats by matching through known shuffle masks
    if ((VT.is128BitVector() && NumElem <= 4) ||
        (VT.is256BitVector() && NumElem <= 8))
      return Op;

    // All i16 and i8 vector types can't be used directly by a generic shuffle
    // instruction because the target has no such instruction. Generate shuffles
    // which repeat i16 and i8 several times until they fit in i32, and then can
    // be manipulated by target supported shuffles. After the insertion of the
    // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
    return PromoteSplat(SVOp, DAG);
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
      if (NewOp.getNode()) {
        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
                              DAG, Subtarget, dl);
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                            DAG, Subtarget, dl);
    }
  }

  return SDValue();
}

SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = VT.getVectorNumElements();
  bool isMMX = VT.getSizeInBits() == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;
  bool HasSSE2 = Subtarget->hasSSE2() || Subtarget->hasAVX();
  bool HasSSE3 = Subtarget->hasSSE3() || Subtarget->hasAVX();
  bool HasSSSE3 = Subtarget->hasSSSE3() || Subtarget->hasAVX();
  MachineFunction &MF = DAG.getMachineFunction();
  bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);

  // Shuffle operations on MMX not supported.
  if (isMMX)
    return Op;

  // Vector shuffle lowering takes 3 steps:
  //
  // 1) Normalize the input vectors. Here splats, zeroed vectors, profitable
  //    narrowing and commutation of operands should be handled.
  // 2) Matching of shuffles with known shuffle masks to x86 target specific