X86ISelLowering.cpp

/// of specified width.
static SDValue getUnpacklMask(unsigned NumElems, SelectionDAG &DAG,
                              DebugLoc dl) {
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT BaseVT = MaskVT.getVectorElementType();
  SmallVector<SDValue, 8> MaskVec;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    MaskVec.push_back(DAG.getConstant(i,            BaseVT));
    MaskVec.push_back(DAG.getConstant(i + NumElems, BaseVT));
  }
  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                     &MaskVec[0], MaskVec.size());
}

/// getUnpackhMask - Returns a vector_shuffle mask for an unpackh operation
/// of specified width.
static SDValue getUnpackhMask(unsigned NumElems, SelectionDAG &DAG,
                              DebugLoc dl) {
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT BaseVT = MaskVT.getVectorElementType();
  unsigned Half = NumElems/2;
  SmallVector<SDValue, 8> MaskVec;
  for (unsigned i = 0; i != Half; ++i) {
    MaskVec.push_back(DAG.getConstant(i + Half,            BaseVT));
    MaskVec.push_back(DAG.getConstant(i + NumElems + Half, BaseVT));
  }
  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                     &MaskVec[0], MaskVec.size());
}

/// getSwapEltZeroMask - Returns a vector_shuffle mask for a shuffle that swaps
/// element #0 of a vector with the specified index, leaving the rest of the
/// elements in place.
static SDValue getSwapEltZeroMask(unsigned NumElems, unsigned DestElt,
                                   SelectionDAG &DAG, DebugLoc dl) {
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT BaseVT = MaskVT.getVectorElementType();
  SmallVector<SDValue, 8> MaskVec;
  // Element #0 of the result gets the elt we are replacing.
  MaskVec.push_back(DAG.getConstant(DestElt, BaseVT));
  for (unsigned i = 1; i != NumElems; ++i)
    MaskVec.push_back(DAG.getConstant(i == DestElt ? 0 : i, BaseVT));
  return DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                     &MaskVec[0], MaskVec.size());
}

/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
static SDValue PromoteSplat(SDValue Op, SelectionDAG &DAG, bool HasSSE2) {
  MVT PVT = HasSSE2 ? MVT::v4i32 : MVT::v4f32;
  MVT VT = Op.getValueType();
  if (PVT == VT)
    return Op;
  SDValue V1 = Op.getOperand(0);
  SDValue Mask = Op.getOperand(2);
  unsigned MaskNumElems = Mask.getNumOperands();
  unsigned NumElems = MaskNumElems;
  DebugLoc dl = Op.getDebugLoc();
  // Special handling of v4f32 -> v4i32.
  if (VT != MVT::v4f32) {
    // Find which element we want to splat.
    SDNode* EltNoNode = getSplatMaskEltNo(Mask.getNode()).getNode();
    unsigned EltNo = cast<ConstantSDNode>(EltNoNode)->getZExtValue();
    // unpack elements to the correct location
    while (NumElems > 4) {
      if (EltNo < NumElems/2) {
        Mask = getUnpacklMask(MaskNumElems, DAG, dl);
      } else {
        Mask = getUnpackhMask(MaskNumElems, DAG, dl);
        EltNo -= NumElems/2;
      }
      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V1, Mask);
      NumElems >>= 1;
    }
    SDValue Cst = DAG.getConstant(EltNo, MVT::i32);
    Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1,
                                  DAG.getUNDEF(PVT), Mask);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle);
}

/// isVectorLoad - Returns true if the node is a vector load, a scalar
/// load that's promoted to vector, or a load bitcasted.
static bool isVectorLoad(SDValue Op) {
  assert(Op.getValueType().isVector() && "Expected a vector type");
  if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR ||
      Op.getOpcode() == ISD::BIT_CONVERT) {
    return isa<LoadSDNode>(Op.getOperand(0));
  }
  return isa<LoadSDNode>(Op);
}


/// CanonicalizeMovddup - Cannonicalize movddup shuffle to v2f64.
///
static SDValue CanonicalizeMovddup(SDValue Op, SDValue V1, SDValue Mask,
                                   SelectionDAG &DAG, bool HasSSE3) {
  // If we have sse3 and shuffle has more than one use or input is a load, then
  // use movddup. Otherwise, use movlhps.
  bool UseMovddup = HasSSE3 && (!Op.hasOneUse() || isVectorLoad(V1));
  MVT PVT = UseMovddup ? MVT::v2f64 : MVT::v4f32;
  MVT VT = Op.getValueType();
  if (VT == PVT)
    return Op;
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = PVT.getVectorNumElements();
  if (NumElems == 2) {
    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
    Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
  } else {
    assert(NumElems == 4);
    SDValue Cst0 = DAG.getTargetConstant(0, MVT::i32);
    SDValue Cst1 = DAG.getTargetConstant(1, MVT::i32);
    Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                       Cst0, Cst1, Cst0, Cst1);
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  SDValue Shuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, PVT, V1,
                                DAG.getUNDEF(PVT), Mask);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shuffle);
}

/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector of zero or undef vector.  This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                             bool isZero, bool HasSSE2,
                                             SelectionDAG &DAG) {
  DebugLoc dl = V2.getDebugLoc();
  MVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, dl) : DAG.getUNDEF(VT);
  unsigned NumElems = V2.getValueType().getVectorNumElements();
  MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
  MVT EVT = MaskVT.getVectorElementType();
  SmallVector<SDValue, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    if (i == Idx)  // If this is the insertion idx, put the low elt of V2 here.
      MaskVec.push_back(DAG.getConstant(NumElems, EVT));
    else
      MaskVec.push_back(DAG.getConstant(i, EVT));
  SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                               &MaskVec[0], MaskVec.size());
  return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, Mask);
}

/// getNumOfConsecutiveZeros - Return the number of elements in a result of
/// a shuffle that is zero.
static
unsigned getNumOfConsecutiveZeros(SDValue Op, SDValue Mask,
                                  unsigned NumElems, bool Low,
                                  SelectionDAG &DAG) {
  unsigned NumZeros = 0;
  for (unsigned i = 0; i < NumElems; ++i) {
    unsigned Index = Low ? i : NumElems-i-1;
    SDValue Idx = Mask.getOperand(Index);
    if (Idx.getOpcode() == ISD::UNDEF) {
      ++NumZeros;
      continue;
    }
    SDValue Elt = DAG.getShuffleScalarElt(Op.getNode(), Index);
    if (Elt.getNode() && isZeroNode(Elt))
      ++NumZeros;
    else
      break;
  }
  return NumZeros;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(SDValue Op, SDValue Mask, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = Mask.getNumOperands();

  isLeft = true;
  unsigned NumZeros= getNumOfConsecutiveZeros(Op, Mask, NumElems, true, DAG);
  if (!NumZeros) {
    isLeft = false;
    NumZeros = getNumOfConsecutiveZeros(Op, Mask, NumElems, false, DAG);
    if (!NumZeros)
      return false;
  }

  bool SeenV1 = false;
  bool SeenV2 = false;
  for (unsigned i = NumZeros; i < NumElems; ++i) {
    unsigned Val = isLeft ? (i - NumZeros) : i;
    SDValue Idx = Mask.getOperand(isLeft ? i : (i - NumZeros));
    if (Idx.getOpcode() == ISD::UNDEF)
      continue;
    unsigned Index = cast<ConstantSDNode>(Idx)->getZExtValue();
    if (Index < NumElems)
      SeenV1 = true;
    else {
      Index -= NumElems;
      SeenV2 = true;
    }
    if (Index != Val)
      return false;
  }
  if (SeenV1 && SeenV2)
    return false;

  ShVal = SeenV1 ? Op.getOperand(0) : Op.getOperand(1);
  ShAmt = NumZeros;
  return true;
}


/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                       unsigned NumNonZero, unsigned NumZero,
                                       SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                       unsigned NumNonZero, unsigned NumZero,
                                       SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, true, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }

  return V;
}

/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, MVT VT, SDValue SrcOp,
                           unsigned NumBits, SelectionDAG &DAG,
                           const TargetLowering &TLI, DebugLoc dl) {
  bool isMMX = VT.getSizeInBits() == 64;
  MVT ShVT = isMMX ? MVT::v1i64 : MVT::v2i64;
  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
  SrcOp = DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(Opc, dl, ShVT, SrcOp,
                             DAG.getConstant(NumBits, TLI.getShiftAmountTy())));
}

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  // All zero's are handled with pxor, all one's are handled with pcmpeqd.
  if (ISD::isBuildVectorAllZeros(Op.getNode())
      || ISD::isBuildVectorAllOnes(Op.getNode())) {
    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
    // eliminated on x86-32 hosts.
    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
      return Op;

    if (ISD::isBuildVectorAllOnes(Op.getNode()))
      return getOnesVector(Op.getValueType(), DAG, dl);
    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
  }

  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned EVTBits = EVT.getSizeInBits();

  unsigned NumElems = Op.getNumOperands();
  unsigned NumZero  = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (isZeroNode(Elt))
      NumZero++;
    else {
      NonZeros |= (1 << i);
      NumNonZero++;
    }
  }

  if (NumNonZero == 0) {
    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
    return DAG.getUNDEF(VT);
  }

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1 && NumElems <= 4) {
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits of
    // the value are obviously zero, truncate the value to i32 and do the
    // insertion that way.  Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0.  It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle MMX and SSE both.
        MVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);

        // Now we have our 32-bit value zero extended in the low element of
        // a vector.  If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SDValue Ops[] = {
            Item, DAG.getUNDEF(Item.getValueType()),
            getSwapEltZeroMask(VecElts, Idx, DAG, dl)
          };
          Item = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VecVT, Ops, 3);
        }
        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.  Because we can only get here
    // when NumElems <= 4, this only needs to handle i32/f32/i64/f64.
    if (Idx == 0 &&
        // Don't do this for i64 values on x86-32.
        (EVT != MVT::i64 || Subtarget->is64Bit())) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
      return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        isZeroNode(Op.getOperand(0)) && !isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
      MVT MaskVT  = MVT::getIntVectorWithNumElements(NumElems);
      MVT MaskEVT = MaskVT.getVectorElementType();
      SmallVector<SDValue, 8> MaskVec;
      for (unsigned i = 0; i < NumElems; i++)
        MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
      SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                   &MaskVec[0], MaskVec.size());
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, Item,
                         DAG.getUNDEF(VT), Mask);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1)
    return SDValue();

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                                 Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true,
                                         Subtarget->hasSSE2(), DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
                                        *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
                                        *this);
    if (V.getNode()) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2+1], V[i*2],
                             getMOVLMask(NumElems, DAG, dl));
          break;
        case 2:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1],
                             getMOVLMask(NumElems, DAG, dl));
          break;
        case 3:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i*2], V[i*2+1],
                             getUnpacklMask(NumElems, DAG, dl));
          break;
      }
    }

    MVT MaskVT = MVT::getIntVectorWithNumElements(NumElems);
    MVT EVT = MaskVT.getVectorElementType();
    SmallVector<SDValue, 8> MaskVec;
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i, EVT));
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
    SDValue ShufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVT,
                                     &MaskVec[0], MaskVec.size());
    return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[0], V[1], ShufMask);
  }

  if (Values.size() > 2) {
    // Expand into a number of unpckl*.
    // e.g. for v4f32
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    SDValue UnpckMask = getUnpacklMask(NumElems, DAG, dl);
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    NumElems >>= 1;
    while (NumElems != 0) {
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V[i], V[i + NumElems],
                           UnpckMask);
      NumElems >>= 1;
    }
    return V[0];
  }

  return SDValue();
}

// v8i16 shuffles - Prefer shuffles in the following order:
// 1. [all]   pshuflw, pshufhw, optional move
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
static
SDValue LowerVECTOR_SHUFFLEv8i16(SDValue V1, SDValue V2,
                                 SDValue PermMask, SelectionDAG &DAG,
                                 X86TargetLowering &TLI, DebugLoc dl) {
  SmallVector<SDValue, 8> MaskElts(PermMask.getNode()->op_begin(),
                                   PermMask.getNode()->op_end());
  SmallVector<int, 8> MaskVals;

  // Determine if more than 1 of the words in each of the low and high quadwords
  // of the result come from the same quadword of one of the two inputs.  Undef
  // mask values count as coming from any quadword, for better codegen.
  SmallVector<unsigned, 4> LoQuad(4);
  SmallVector<unsigned, 4> HiQuad(4);
  BitVector InputQuads(4);
  for (unsigned i = 0; i < 8; ++i) {
    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
    SDValue Elt = MaskElts[i];
    int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 : 
                 cast<ConstantSDNode>(Elt)->getZExtValue();
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0) {
      ++Quad[0];
      ++Quad[1];
      ++Quad[2];
      ++Quad[3];
      continue;
    }
    ++Quad[EltIdx / 4];
    InputQuads.set(EltIdx / 4);
  }

  int BestLoQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LoQuad[i] > MaxQuad) {
      BestLoQuad = i;
      MaxQuad = LoQuad[i];
    }
  }

  int BestHiQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HiQuad[i] > MaxQuad) {
      BestHiQuad = i;
      MaxQuad = HiQuad[i];
    }
  }

  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
  // of the two input vectors, shuffle them into one input vector so only a 
  // single pshufb instruction is necessary. If There are more than 2 input
  // quads, disable the next transformation since it does not help SSSE3.
  bool V1Used = InputQuads[0] || InputQuads[1];
  bool V2Used = InputQuads[2] || InputQuads[3];
  if (TLI.getSubtarget()->hasSSSE3()) {
    if (InputQuads.count() == 2 && V1Used && V2Used) {
      BestLoQuad = InputQuads.find_first();
      BestHiQuad = InputQuads.find_next(BestLoQuad);
    }
    if (InputQuads.count() > 2) {
      BestLoQuad = -1;
      BestHiQuad = -1;
    }
  }

  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
  // the shuffle mask.  If a quad is scored as -1, that means that it contains
  // words from all 4 input quadwords.
  SDValue NewV;
  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
    SmallVector<SDValue,8> MaskV;
    MaskV.push_back(DAG.getConstant(BestLoQuad < 0 ? 0 : BestLoQuad, MVT::i64));
    MaskV.push_back(DAG.getConstant(BestHiQuad < 0 ? 1 : BestHiQuad, MVT::i64));
    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, &MaskV[0], 2);
    
    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2i64,
                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), Mask);
    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);

    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
    // source words for the shuffle, to aid later transformations.
    bool AllWordsInNewV = true;
    bool InOrder[2] = { true, true };
    for (unsigned i = 0; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx != (int)i)
        InOrder[i/4] = false;
      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
        continue;
      AllWordsInNewV = false;
      break;
    }

    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
    if (AllWordsInNewV) {
      for (int i = 0; i != 8; ++i) {
        int idx = MaskVals[i];
        if (idx < 0)
          continue;
        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; 
        if ((idx != i) && idx < 4)
          pshufhw = false;
        if ((idx != i) && idx > 3)
          pshuflw = false;
      }
      V1 = NewV;
      V2Used = false;
      BestLoQuad = 0;
      BestHiQuad = 1;
    }

    // If we've eliminated the use of V2, and the new mask is a pshuflw or
    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
      MaskV.clear();
      for (unsigned i = 0; i != 8; ++i)
        MaskV.push_back((MaskVals[i] < 0) ? DAG.getUNDEF(MVT::i16)
                                          : DAG.getConstant(MaskVals[i],
                                                            MVT::i16));
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV, 
                         DAG.getUNDEF(MVT::v8i16), 
                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16,
                                     &MaskV[0], 8));
    }
  }
  
  // If we have SSSE3, and all words of the result are from 1 input vector,
  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
  // is present, fall back to case 4.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;
    
    // If we have elements from both input vectors, set the high bit of the
    // shuffle mask element to zero out elements that come from V2 in the V1 
    // mask, and elements that come from V1 in the V2 mask, so that the two
    // results can be OR'd together.
    bool TwoInputs = V1Used && V2Used;
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (TwoInputs && (EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
    }
    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, 
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
    
    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 8; ++i) {
      int EltIdx = MaskVals[i] * 2;
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
    }
    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, 
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  }

  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
  // and update MaskVals with new element order.
  BitVector InOrder(8);
  if (BestLoQuad >= 0) {
    SmallVector<SDValue, 8> MaskV;
    for (int i = 0; i != 4; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(DAG.getUNDEF(MVT::i16));
        InOrder.set(i);
      } else if ((idx / 4) == BestLoQuad) {
        MaskV.push_back(DAG.getConstant(idx & 3, MVT::i16));
        InOrder.set(i);
      } else {
        MaskV.push_back(DAG.getUNDEF(MVT::i16));
      }
    }
    for (unsigned i = 4; i != 8; ++i)
      MaskV.push_back(DAG.getConstant(i, MVT::i16));
    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
                       DAG.getUNDEF(MVT::v8i16),
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
                                   MVT::v8i16, &MaskV[0], 8));
  }
  
  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
  // and update MaskVals with the new element order.
  if (BestHiQuad >= 0) {
    SmallVector<SDValue, 8> MaskV;
    for (unsigned i = 0; i != 4; ++i)
      MaskV.push_back(DAG.getConstant(i, MVT::i16));
    for (unsigned i = 4; i != 8; ++i) {
      int idx = MaskVals[i];
      if (idx < 0) {
        MaskV.push_back(DAG.getUNDEF(MVT::i16));
        InOrder.set(i);
      } else if ((idx / 4) == BestHiQuad) {
        MaskV.push_back(DAG.getConstant((idx & 3) + 4, MVT::i16));
        InOrder.set(i);
      } else {
        MaskV.push_back(DAG.getUNDEF(MVT::i16));
      }
    }
    NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v8i16, NewV,
                       DAG.getUNDEF(MVT::v8i16),
                       DAG.getNode(ISD::BUILD_VECTOR, dl,
                                   MVT::v8i16, &MaskV[0], 8));
  }
  
  // In case BestHi & BestLo were both -1, which means each quadword has a word
  // from each of the four input quadwords, calculate the InOrder bitvector now
  // before falling through to the insert/extract cleanup.
  if (BestLoQuad == -1 && BestHiQuad == -1) {
    NewV = V1;
    for (int i = 0; i != 8; ++i)
      if (MaskVals[i] < 0 || MaskVals[i] == i)
        InOrder.set(i);
  }
  
  // The other elements are put in the right place using pextrw and pinsrw.
  for (unsigned i = 0; i != 8; ++i) {
    if (InOrder[i])
      continue;
    int EltIdx = MaskVals[i];
    if (EltIdx < 0)
      continue;
    SDValue ExtOp = (EltIdx < 8)
    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
                  DAG.getIntPtrConstant(EltIdx))
    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
                  DAG.getIntPtrConstant(EltIdx - 8));
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
                       DAG.getIntPtrConstant(i));
  }
  return NewV;
}

// v16i8 shuffles - Prefer shuffles in the following order:
// 1. [ssse3] 1 x pshufb
// 2. [ssse3] 2 x pshufb + 1 x por
// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
static
SDValue LowerVECTOR_SHUFFLEv16i8(SDValue V1, SDValue V2,
                                 SDValue PermMask, SelectionDAG &DAG,
                                 X86TargetLowering &TLI, DebugLoc dl) {
  SmallVector<SDValue, 16> MaskElts(PermMask.getNode()->op_begin(),
                                    PermMask.getNode()->op_end());
  SmallVector<int, 16> MaskVals;
  
  // If we have SSSE3, case 1 is generated when all result bytes come from
  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is 
  // present, fall back to case 3.
  // FIXME: kill V2Only once shuffles are canonizalized by getNode.
  bool V1Only = true;
  bool V2Only = true;
  for (unsigned i = 0; i < 16; ++i) {
    SDValue Elt = MaskElts[i];
    int EltIdx = Elt.getOpcode() == ISD::UNDEF ? -1 : 
                 cast<ConstantSDNode>(Elt)->getZExtValue();
    MaskVals.push_back(EltIdx);
    if (EltIdx < 0)
      continue;
    if (EltIdx < 16)
      V2Only = false;
    else
      V1Only = false;
  }
  
  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
  if (TLI.getSubtarget()->hasSSSE3()) {
    SmallVector<SDValue,16> pshufbMask;
    
    // If all result elements are from one input vector, then only translate
    // undef mask values to 0x80 (zero out result) in the pshufb mask. 
    //
    // Otherwise, we have elements from both input vectors, and must zero out
    // elements that come from V2 in the first mask, and V1 in the second mask
    // so that we can OR them together.
    bool TwoInputs = !(V1Only || V2Only);
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
    }
    // If all the elements are from V2, assign it to V1 and return after
    // building the first pshufb.
    if (V2Only)
      V1 = V2;
    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    if (!TwoInputs)
      return V1;
    
    // Calculate the shuffle mask for the second input, shuffle it, and
    // OR it with the first shuffled input.
    pshufbMask.clear();
    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }
  
  // No SSSE3 - Calculate in place words and then fix all out of place words
  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
  SDValue NewV = V2Only ? V2 : V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];
    
    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;
    
    // This word of the result is already in the correct place, skip it.
    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
      continue;
    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
      continue;
    
    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and can be load
    // using a single extract together, load it and store it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                        DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source.  If the
    // source byte is not also odd, shift the extracted word left 8 bits
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source.  If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                             DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(SDValue V1, SDValue V2,
                                MVT VT,
                                SDValue PermMask, SelectionDAG &DAG,
                                TargetLowering &TLI, DebugLoc dl) {
  unsigned NumElems = PermMask.getNumOperands();
  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
  MVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
  MVT MaskEltVT = MaskVT.getVectorElementType();
  MVT NewVT = MaskVT;
  switch (VT.getSimpleVT()) {
  default: assert(false && "Unexpected!");
  case MVT::v4f32: NewVT = MVT::v2f64; break;
  case MVT::v4i32: NewVT = MVT::v2i64; break;
  case MVT::v8i16: NewVT = MVT::v4i32; break;
  case MVT::v16i8: NewVT = MVT::v4i32; break;
  }

  if (NewWidth == 2) {
    if (VT.isInteger())
      NewVT = MVT::v2i64;
    else
      NewVT = MVT::v2f64;
  }
  unsigned Scale = NumElems / NewWidth;
  SmallVector<SDValue, 8> MaskVec;
  for (unsigned i = 0; i < NumElems; i += Scale) {
    unsigned StartIdx = ~0U;
    for (unsigned j = 0; j < Scale; ++j) {
      SDValue Elt = PermMask.getOperand(i+j);
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getZExtValue();
      if (StartIdx == ~0U)
        StartIdx = EltIdx - (EltIdx % Scale);
      if (EltIdx != StartIdx + j)
        return SDValue();
    }
    if (StartIdx == ~0U)
      MaskVec.push_back(DAG.getUNDEF(MaskEltVT));
    else