X86ISelLowering.cpp

/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
      return false;

  for (unsigned i = NumElems/2; i < NumElems; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  return true;
}

/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
      return false;

  return true;
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      if (isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  int NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (int i = 1; i < NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
}

/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 1, 1, 3, 3
  for (unsigned i = 0; i < 2; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 1)
      return false;
  }

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 3)
      return false;
    if (Elt == 3)
      HasHi = true;
  }
  // Don't use movshdup if it can be done with a shufps.
  // FIXME: verify that matching u, u, 3, 3 is what we want.
  return HasHi;
}

/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 0, 0, 2, 2
  for (unsigned i = 0; i < 2; ++i)
    if (N->getMaskElt(i) > 0)
      return false;

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 2)
      return false;
    if (Elt == 2)
      HasHi = true;
  }
  // Don't use movsldup if it can be done with a shufps.
  return HasHi;
}

/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
  int e = N->getValueType(0).getVectorNumElements() / 2;

  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;
  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
      return false;
  return true;
}

/// isVEXTRACTF128Index - Return true if the specified
/// EXTRACT_SUBVECTOR operand specifies a vector extract that is
/// suitable for input to VEXTRACTF128.
bool X86::isVEXTRACTF128Index(SDNode *N) {
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    return false;

  // The index should be aligned on a 128-bit boundary.
  uint64_t Index =
    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();

  unsigned VL = N->getValueType(0).getVectorNumElements();
  unsigned VBits = N->getValueType(0).getSizeInBits();
  unsigned ElSize = VBits / VL;
  bool Result = (Index * ElSize) % 128 == 0;

  return Result;
}

/// isVINSERTF128Index - Return true if the specified INSERT_SUBVECTOR
/// operand specifies a subvector insert that is suitable for input to
/// VINSERTF128.
bool X86::isVINSERTF128Index(SDNode *N) {
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    return false;

  // The index should be aligned on a 128-bit boundary.
  uint64_t Index =
    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();

  unsigned VL = N->getValueType(0).getVectorNumElements();
  unsigned VBits = N->getValueType(0).getSizeInBits();
  unsigned ElSize = VBits / VL;
  bool Result = (Index * ElSize) % 128 == 0;

  return Result;
}

/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  int NumOperands = SVOp->getValueType(0).getVectorNumElements();

  unsigned Shift = (NumOperands == 4) ? 2 : 1;
  unsigned Mask = 0;
  for (int i = 0; i < NumOperands; ++i) {
    int Val = SVOp->getMaskElt(NumOperands-i-1);
    if (Val < 0) Val = 0;
    if (Val >= NumOperands) Val -= NumOperands;
    Mask |= Val;
    if (i != NumOperands - 1)
      Mask <<= Shift;
  }
  return Mask;
}

/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the last 4.
  for (unsigned i = 7; i >= 4; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= (Val - 4);
    if (i != 4)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the first 4.
  for (int i = 3; i >= 0; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= Val;
    if (i != 0)
      Mask <<= 2;
  }
  return Mask;
}

/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VVT = N->getValueType(0);
  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
  int Val = 0;

  unsigned i, e;
  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
    Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      break;
  }
  return (Val - i) * EltSize;
}

/// getExtractVEXTRACTF128Immediate - Return the appropriate immediate
/// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
/// instructions.
unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
    llvm_unreachable("Illegal extract subvector for VEXTRACTF128");

  uint64_t Index =
    cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();

  EVT VecVT = N->getOperand(0).getValueType();
  EVT ElVT = VecVT.getVectorElementType();

  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();

  return Index / NumElemsPerChunk;
}

/// getInsertVINSERTF128Immediate - Return the appropriate immediate
/// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
/// instructions.
unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
    llvm_unreachable("Illegal insert subvector for VINSERTF128");

  uint64_t Index =
    cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();  

  EVT VecVT = N->getValueType(0);
  EVT ElVT = VecVT.getVectorElementType();

  unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();

  return Index / NumElemsPerChunk;
}

/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
  return ((isa<ConstantSDNode>(Elt) &&
           cast<ConstantSDNode>(Elt)->isNullValue()) ||
          (isa<ConstantFPSDNode>(Elt) &&
           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
}

/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                                    SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> MaskVec;

  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = SVOp->getMaskElt(i);
    if (idx < 0)
      MaskVec.push_back(idx);
    else if (idx < (int)NumElems)
      MaskVec.push_back(idx + NumElems);
    else
      MaskVec.push_back(idx - NumElems);
  }
  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
                              SVOp->getOperand(0), &MaskVec[0]);
}

/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
/// the two vector operands have swapped position.
static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
  unsigned NumElems = VT.getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int idx = Mask[i];
    if (idx < 0)
      continue;
    else if (idx < (int)NumElems)
      Mask[i] = idx + NumElems;
    else
      Mask[i] = idx - NumElems;
  }
}

/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
/// match movhlps. The lower half elements should come from upper half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order).
static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
  if (Op->getValueType(0).getVectorNumElements() != 4)
    return false;
  for (unsigned i = 0, e = 2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
      return false;
  for (unsigned i = 2; i != 4; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
      return false;
  return true;
}

/// isScalarLoadToVector - Returns true if the node is a scalar load that
/// is promoted to a vector. It also returns the LoadSDNode by reference if
/// required.
static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
    return false;
  N = N->getOperand(0).getNode();
  if (!ISD::isNON_EXTLoad(N))
    return false;
  if (LD)
    *LD = cast<LoadSDNode>(N);
  return true;
}

/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
/// match movlp{s|d}. The lower half elements should come from lower half of
/// V1 (and in order), and the upper half elements should come from the upper
/// half of V2 (and in order). And since V1 will become the source of the
/// MOVLP, it must be either a vector load or a scalar load to vector.
static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
                               ShuffleVectorSDNode *Op) {
  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
    return false;
  // Is V2 is a vector load, don't do this transformation. We will try to use
  // load folding shufps op.
  if (ISD::isNON_EXTLoad(V2))
    return false;

  unsigned NumElems = Op->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i))
      return false;
  for (unsigned i = NumElems/2; i != NumElems; ++i)
    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
      return false;
  return true;
}

/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
/// all the same.
static bool isSplatVector(SDNode *N) {
  if (N->getOpcode() != ISD::BUILD_VECTOR)
    return false;

  SDValue SplatValue = N->getOperand(0);
  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
    if (N->getOperand(i) != SplatValue)
      return false;
  return true;
}

/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to an zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
  }
  return true;
}

/// getZeroVector - Returns a vector of specified type with all zero elements.
///
static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
                             DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build SSE zero vectors as <4 x i32> bitcasted
  // to their dest type. This ensures they get CSE'd.
  SDValue Vec;
  if (VT.getSizeInBits() == 128) {  // SSE
    if (HasSSE2) {  // SSE2
      SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
    } else { // SSE1
      SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
    }
  } else if (VT.getSizeInBits() == 256) { // AVX
    // 256-bit logic and arithmetic instructions in AVX are
    // all floating-point, no support for integer ops. Default
    // to emitting fp zeroed vectors then.
    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops, 8);
  }
  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}

/// getOnesVector - Returns a vector of specified type with all bits set.
///
static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
  assert(VT.isVector() && "Expected a vector type");

  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
  // type.  This ensures they get CSE'd.
  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
  SDValue Vec;
  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
  return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
}


/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 points to its first element.
static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  bool Changed = false;
  SmallVector<int, 8> MaskVec;
  SVOp->getMask(MaskVec);

  for (unsigned i = 0; i != NumElems; ++i) {
    if (MaskVec[i] > (int)NumElems) {
      MaskVec[i] = NumElems;
      Changed = true;
    }
  }
  if (Changed)
    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
                                SVOp->getOperand(1), &MaskVec[0]);
  return SDValue(SVOp, 0);
}

/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
/// operation of specified width.
static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                       SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Half = NumElems/2;
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
  EVT PVT = MVT::v4f32;
  EVT VT = SV->getValueType(0);
  DebugLoc dl = SV->getDebugLoc();
  SDValue V1 = SV->getOperand(0);
  int NumElems = VT.getVectorNumElements();
  int EltNo = SV->getSplatIndex();

  // unpack elements to the correct location
  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V1 = getUnpackl(DAG, dl, VT, V1, V1);
    } else {
      V1 = getUnpackh(DAG, dl, VT, V1, V1);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }

  // Perform the splat.
  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
  V1 = DAG.getNode(ISD::BITCAST, dl, PVT, V1);
  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
  return DAG.getNode(ISD::BITCAST, dl, VT, V1);
}

/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector of zero or undef vector.  This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                             bool isZero, bool HasSSE2,
                                             SelectionDAG &DAG) {
  EVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec.push_back(i == Idx ? NumElems : i);
  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}

/// getShuffleScalarElt - Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
                            unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    Index = SV->getMaskElt(Index);

    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    int NumElems = VT.getVectorNumElements();
    SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    int NumElems = VT.getVectorNumElements();
    SmallVector<unsigned, 16> ShuffleMask;
    SDValue ImmN;

    switch(Opcode) {
    case X86ISD::SHUFPS:
    case X86ISD::SHUFPD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeSHUFPSMask(NumElems,
                       cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       ShuffleMask);
      break;
    case X86ISD::PUNPCKHBW:
    case X86ISD::PUNPCKHWD:
    case X86ISD::PUNPCKHDQ:
    case X86ISD::PUNPCKHQDQ:
      DecodePUNPCKHMask(NumElems, ShuffleMask);
      break;
    case X86ISD::UNPCKHPS:
    case X86ISD::UNPCKHPD:
      DecodeUNPCKHPMask(NumElems, ShuffleMask);
      break;
    case X86ISD::PUNPCKLBW:
    case X86ISD::PUNPCKLWD:
    case X86ISD::PUNPCKLDQ:
    case X86ISD::PUNPCKLQDQ:
      DecodePUNPCKLMask(NumElems, ShuffleMask);
      break;
    case X86ISD::UNPCKLPS:
    case X86ISD::UNPCKLPD:
      DecodeUNPCKLPMask(NumElems, ShuffleMask);
      break;
    case X86ISD::MOVHLPS:
      DecodeMOVHLPSMask(NumElems, ShuffleMask);
      break;
    case X86ISD::MOVLHPS:
      DecodeMOVLHPSMask(NumElems, ShuffleMask);
      break;
    case X86ISD::PSHUFD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFMask(NumElems,
                      cast<ConstantSDNode>(ImmN)->getZExtValue(),
                      ShuffleMask);
      break;
    case X86ISD::PSHUFHW:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::PSHUFLW:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::MOVSS:
    case X86ISD::MOVSD: {
      // The index 0 always comes from the first element of the second source,
      // this is why MOVSS and MOVSD are used in the first place. The other
      // elements come from the other positions of the first source vector.
      unsigned OpNum = (Index == 0) ? 1 : 0;
      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                 Depth+1);
    }
    default:
      assert("not implemented for target shuffle node");
      return SDValue();
    }

    Index = ShuffleMask[Index];
    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                          : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which come from a consecutively from a zero. The
/// search can start in two diferent directions, from left or right.
static
unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
                                  bool ZerosFromLeft, SelectionDAG &DAG) {
  int i = 0;

  while (i < NumElems) {
    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
    if (!(Elt.getNode() &&
         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
      break;
    ++i;
  }

  return i;
}

/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to
/// MaskE correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also tell OpNum which source vector operand.
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
                              int OpIdx, int NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indicies
    if (Idx < 0)
      continue;

    if (Idx < NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //               V1 = {X, A, B, C}     0
  //                         \  \  \    /
  //   vector_shuffle V1, V2 <1, 2, 3, X>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            0,                   // Mask Start Index
            NumElems-NumZeros-1, // Mask End Index
            NumZeros,            // Where to start looking in the src vector
            NumElems,            // Number of elements in vector
            OpSrc))              // Which source operand ?
    return false;

  isLeft = false;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              true /* check zeros from left */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                           0    { A, B, X, X } = V2
  //                          / \    /  /
  //   vector_shuffle V1, V2 <X, X, 4, 5>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            NumZeros,     // Mask Start Index
            NumElems-1,   // Mask End Index
            0,            // Where to start looking in the src vector
            NumElems,     // Number of elements in vector
            OpSrc))       // Which source operand ?
    return false;

  isLeft = true;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
    return true;

  return false;
}

/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                       unsigned NumNonZero, unsigned NumZero,
                                       SelectionDAG &DAG,
                                       const TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();