X86ISelLowering.cpp

static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 8> Mask;
  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
    Mask.push_back(i);
    Mask.push_back(i + NumElems);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

/// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
                          SDValue V2) {
  unsigned NumElems = VT.getVectorNumElements();
  unsigned Half = NumElems/2;
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i != Half; ++i) {
    Mask.push_back(i + Half);
    Mask.push_back(i + NumElems + Half);
  }
  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
}

// PromoteSplatv8v16 - All i16 and i8 vector types can't be used directly by
// a generic shuffle instruction because the target has no such instructions.
// Generate shuffles which repeat i16 and i8 several times until they can be
// represented by v4f32 and then be manipulated by target suported shuffles.
static SDValue PromoteSplatv8v16(SDValue V, SelectionDAG &DAG, int &EltNo) {
  EVT VT = V.getValueType();
  int NumElems = VT.getVectorNumElements();
  DebugLoc dl = V.getDebugLoc();

  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V = getUnpackl(DAG, dl, VT, V, V);
    } else {
      V = getUnpackh(DAG, dl, VT, V, V);
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }
  return V;
}

/// getLegalSplat - Generate a legal splat with supported x86 shuffles
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
  EVT VT = V.getValueType();
  DebugLoc dl = V.getDebugLoc();
  assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
         && "Vector size not supported");

  bool Is128 = VT.getSizeInBits() == 128;
  EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  V = DAG.getNode(ISD::BITCAST, dl, NVT, V);

  if (Is128) {
    int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
    V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
  } else {
    // The second half of indicies refer to the higher part, which is a
    // duplication of the lower one. This makes this shuffle a perfect match
    // for the VPERM instruction.
    int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
                         EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
    V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
  }

  return DAG.getNode(ISD::BITCAST, dl, VT, V);
}

/// PromoteVectorToScalarSplat - Since there's no native support for
/// scalar_to_vector for 256-bit AVX, a 128-bit scalar_to_vector +
/// INSERT_SUBVECTOR is generated. Recognize this idiom and do the
/// shuffle before the insertion, this yields less instructions in the end.
static SDValue PromoteVectorToScalarSplat(ShuffleVectorSDNode *SV,
                                          SelectionDAG &DAG) {
  EVT SrcVT = SV->getValueType(0);
  SDValue V1 = SV->getOperand(0);
  DebugLoc dl = SV->getDebugLoc();
  int NumElems = SrcVT.getVectorNumElements();

  assert(SrcVT.is256BitVector() && "unknown howto handle vector type");

  SmallVector<int, 4> Mask;
  for (int i = 0; i < NumElems/2; ++i)
    Mask.push_back(SV->getMaskElt(i));

  EVT SVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
                             NumElems/2);
  SDValue SV1 = DAG.getVectorShuffle(SVT, dl, V1.getOperand(1),
                                     DAG.getUNDEF(SVT), &Mask[0]);
  SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), SV1,
                                    DAG.getConstant(0, MVT::i32), DAG, dl);

  return Insert128BitVector(InsV, SV1,
                       DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
}

/// PromoteSplat - Promote a splat of v4i32, v8i16 or v16i8 to v4f32 and
/// v8i32, v16i16 or v32i8 to v8f32.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
  EVT SrcVT = SV->getValueType(0);
  SDValue V1 = SV->getOperand(0);
  DebugLoc dl = SV->getDebugLoc();

  int EltNo = SV->getSplatIndex();
  int NumElems = SrcVT.getVectorNumElements();
  unsigned Size = SrcVT.getSizeInBits();

  // Extract the 128-bit part containing the splat element and update
  // the splat element index when it refers to the higher register.
  if (Size == 256) {
    unsigned Idx = (EltNo > NumElems/2) ? NumElems/2 : 0;
    V1 = Extract128BitVector(V1, DAG.getConstant(Idx, MVT::i32), DAG, dl);
    if (Idx > 0)
      EltNo -= NumElems/2;
  }

  // Make this 128-bit vector duplicate i8 and i16 elements
  if (NumElems > 4)
    V1 = PromoteSplatv8v16(V1, DAG, EltNo);

  // Recreate the 256-bit vector and place the same 128-bit vector
  // into the low and high part. This is necessary because we want
  // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles
  // inside each separate v4f32 lane.
  if (Size == 256) {
    SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
                         DAG.getConstant(0, MVT::i32), DAG, dl);
    V1 = Insert128BitVector(InsV, V1,
               DAG.getConstant(NumElems/2, MVT::i32), DAG, dl);
  }

  return getLegalSplat(DAG, V1, EltNo);
}

/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
/// vector of zero or undef vector.  This produces a shuffle where the low
/// element of V2 is swizzled into the zero/undef vector, landing at element
/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
                                             bool isZero, bool HasSSE2,
                                             SelectionDAG &DAG) {
  EVT VT = V2.getValueType();
  SDValue V1 = isZero
    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
  unsigned NumElems = VT.getVectorNumElements();
  SmallVector<int, 16> MaskVec;
  for (unsigned i = 0; i != NumElems; ++i)
    // If this is the insertion idx, put the low elt of V2 here.
    MaskVec.push_back(i == Idx ? NumElems : i);
  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
}

/// getShuffleScalarElt - Returns the scalar element that will make up the ith
/// element of the result of the vector shuffle.
static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG,
                                   unsigned Depth) {
  if (Depth == 6)
    return SDValue();  // Limit search depth.

  SDValue V = SDValue(N, 0);
  EVT VT = V.getValueType();
  unsigned Opcode = V.getOpcode();

  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
    Index = SV->getMaskElt(Index);

    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    int NumElems = VT.getVectorNumElements();
    SDValue NewV = (Index < NumElems) ? SV->getOperand(0) : SV->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG, Depth+1);
  }

  // Recurse into target specific vector shuffles to find scalars.
  if (isTargetShuffle(Opcode)) {
    int NumElems = VT.getVectorNumElements();
    SmallVector<unsigned, 16> ShuffleMask;
    SDValue ImmN;

    switch(Opcode) {
    case X86ISD::SHUFPS:
    case X86ISD::SHUFPD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeSHUFPSMask(NumElems,
                       cast<ConstantSDNode>(ImmN)->getZExtValue(),
                       ShuffleMask);
      break;
    case X86ISD::PUNPCKHBW:
    case X86ISD::PUNPCKHWD:
    case X86ISD::PUNPCKHDQ:
    case X86ISD::PUNPCKHQDQ:
      DecodePUNPCKHMask(NumElems, ShuffleMask);
      break;
    case X86ISD::UNPCKHPS:
    case X86ISD::UNPCKHPD:
    case X86ISD::VUNPCKHPSY:
    case X86ISD::VUNPCKHPDY:
      DecodeUNPCKHPMask(NumElems, ShuffleMask);
      break;
    case X86ISD::PUNPCKLBW:
    case X86ISD::PUNPCKLWD:
    case X86ISD::PUNPCKLDQ:
    case X86ISD::PUNPCKLQDQ:
      DecodePUNPCKLMask(VT, ShuffleMask);
      break;
    case X86ISD::UNPCKLPS:
    case X86ISD::UNPCKLPD:
    case X86ISD::VUNPCKLPSY:
    case X86ISD::VUNPCKLPDY:
      DecodeUNPCKLPMask(VT, ShuffleMask);
      break;
    case X86ISD::MOVHLPS:
      DecodeMOVHLPSMask(NumElems, ShuffleMask);
      break;
    case X86ISD::MOVLHPS:
      DecodeMOVLHPSMask(NumElems, ShuffleMask);
      break;
    case X86ISD::PSHUFD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFMask(NumElems,
                      cast<ConstantSDNode>(ImmN)->getZExtValue(),
                      ShuffleMask);
      break;
    case X86ISD::PSHUFHW:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFHWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::PSHUFLW:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodePSHUFLWMask(cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::MOVSS:
    case X86ISD::MOVSD: {
      // The index 0 always comes from the first element of the second source,
      // this is why MOVSS and MOVSD are used in the first place. The other
      // elements come from the other positions of the first source vector.
      unsigned OpNum = (Index == 0) ? 1 : 0;
      return getShuffleScalarElt(V.getOperand(OpNum).getNode(), Index, DAG,
                                 Depth+1);
    }
    case X86ISD::VPERMILPS:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERMILPSMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::VPERMILPSY:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERMILPSMask(8, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::VPERMILPD:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERMILPDMask(2, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    case X86ISD::VPERMILPDY:
      ImmN = N->getOperand(N->getNumOperands()-1);
      DecodeVPERMILPDMask(4, cast<ConstantSDNode>(ImmN)->getZExtValue(),
                        ShuffleMask);
      break;
    default:
      assert("not implemented for target shuffle node");
      return SDValue();
    }

    Index = ShuffleMask[Index];
    if (Index < 0)
      return DAG.getUNDEF(VT.getVectorElementType());

    SDValue NewV = (Index < NumElems) ? N->getOperand(0) : N->getOperand(1);
    return getShuffleScalarElt(NewV.getNode(), Index % NumElems, DAG,
                               Depth+1);
  }

  // Actual nodes that may contain scalar elements
  if (Opcode == ISD::BITCAST) {
    V = V.getOperand(0);
    EVT SrcVT = V.getValueType();
    unsigned NumElems = VT.getVectorNumElements();

    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
      return SDValue();
  }

  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
    return (Index == 0) ? V.getOperand(0)
                          : DAG.getUNDEF(VT.getVectorElementType());

  if (V.getOpcode() == ISD::BUILD_VECTOR)
    return V.getOperand(Index);

  return SDValue();
}

/// getNumOfConsecutiveZeros - Return the number of elements of a vector
/// shuffle operation which come from a consecutively from a zero. The
/// search can start in two different directions, from left or right.
static
unsigned getNumOfConsecutiveZeros(SDNode *N, int NumElems,
                                  bool ZerosFromLeft, SelectionDAG &DAG) {
  int i = 0;

  while (i < NumElems) {
    unsigned Index = ZerosFromLeft ? i : NumElems-i-1;
    SDValue Elt = getShuffleScalarElt(N, Index, DAG, 0);
    if (!(Elt.getNode() &&
         (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt))))
      break;
    ++i;
  }

  return i;
}

/// isShuffleMaskConsecutive - Check if the shuffle mask indicies from MaskI to
/// MaskE correspond consecutively to elements from one of the vector operands,
/// starting from its index OpIdx. Also tell OpNum which source vector operand.
static
bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, int MaskI, int MaskE,
                              int OpIdx, int NumElems, unsigned &OpNum) {
  bool SeenV1 = false;
  bool SeenV2 = false;

  for (int i = MaskI; i <= MaskE; ++i, ++OpIdx) {
    int Idx = SVOp->getMaskElt(i);
    // Ignore undef indicies
    if (Idx < 0)
      continue;

    if (Idx < NumElems)
      SeenV1 = true;
    else
      SeenV2 = true;

    // Only accept consecutive elements from the same vector
    if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
      return false;
  }

  OpNum = SeenV1 ? 0 : 1;
  return true;
}

/// isVectorShiftRight - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              false /* check zeros from right */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //               V1 = {X, A, B, C}     0
  //                         \  \  \    /
  //   vector_shuffle V1, V2 <1, 2, 3, X>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            0,                   // Mask Start Index
            NumElems-NumZeros-1, // Mask End Index
            NumZeros,            // Where to start looking in the src vector
            NumElems,            // Number of elements in vector
            OpSrc))              // Which source operand ?
    return false;

  isLeft = false;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
/// logical left shift of a vector.
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                              bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  unsigned NumElems = SVOp->getValueType(0).getVectorNumElements();
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems,
              true /* check zeros from left */, DAG);
  unsigned OpSrc;

  if (!NumZeros)
    return false;

  // Considering the elements in the mask that are not consecutive zeros,
  // check if they consecutively come from only one of the source vectors.
  //
  //                           0    { A, B, X, X } = V2
  //                          / \    /  /
  //   vector_shuffle V1, V2 <X, X, 4, 5>
  //
  if (!isShuffleMaskConsecutive(SVOp,
            NumZeros,     // Mask Start Index
            NumElems-1,   // Mask End Index
            0,            // Where to start looking in the src vector
            NumElems,     // Number of elements in vector
            OpSrc))       // Which source operand ?
    return false;

  isLeft = true;
  ShAmt = NumZeros;
  ShVal = SVOp->getOperand(OpSrc);
  return true;
}

/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
      isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
    return true;

  return false;
}

/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                       unsigned NumNonZero, unsigned NumZero,
                                       SelectionDAG &DAG,
                                       const TargetLowering &TLI) {
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
///
static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                     unsigned NumNonZero, unsigned NumZero,
                                     SelectionDAG &DAG,
                                     const TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  for (unsigned i = 0; i < 8; ++i) {
    bool isNonZero = (NonZeros & (1 << i)) != 0;
    if (isNonZero) {
      if (First) {
        if (NumZero)
          V = getZeroVector(MVT::v8i16, true, DAG, dl);
        else
          V = DAG.getUNDEF(MVT::v8i16);
        First = false;
      }
      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
                      MVT::v8i16, V, Op.getOperand(i),
                      DAG.getIntPtrConstant(i));
    }
  }

  return V;
}

/// getVShift - Return a vector logical shift node.
///
static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
                         unsigned NumBits, SelectionDAG &DAG,
                         const TargetLowering &TLI, DebugLoc dl) {
  EVT ShVT = MVT::v2i64;
  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
  SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(Opc, dl, ShVT, SrcOp,
                             DAG.getConstant(NumBits,
                                  TLI.getShiftAmountTy(SrcOp.getValueType()))));
}

SDValue
X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
                                          SelectionDAG &DAG) const {

  // Check if the scalar load can be widened into a vector load. And if
  // the address is "base + cst" see if the cst can be "absorbed" into
  // the shuffle mask.
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
    SDValue Ptr = LD->getBasePtr();
    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
      return SDValue();
    EVT PVT = LD->getValueType(0);
    if (PVT != MVT::i32 && PVT != MVT::f32)
      return SDValue();

    int FI = -1;
    int64_t Offset = 0;
    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
      FI = FINode->getIndex();
      Offset = 0;
    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
      Offset = Ptr.getConstantOperandVal(1);
      Ptr = Ptr.getOperand(0);
    } else {
      return SDValue();
    }

    SDValue Chain = LD->getChain();
    // Make sure the stack object alignment is at least 16.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    if (DAG.InferPtrAlignment(Ptr) < 16) {
      if (MFI->isFixedObjectIndex(FI)) {
        // Can't change the alignment. FIXME: It's possible to compute
        // the exact stack offset and reference FI + adjust offset instead.
        // If someone *really* cares about this. That's the way to implement it.
        return SDValue();
      } else {
        MFI->setObjectAlignment(FI, 16);
      }
    }

    // (Offset % 16) must be multiple of 4. Then address is then
    // Ptr + (Offset & ~15).
    if (Offset < 0)
      return SDValue();
    if ((Offset % 16) & 3)
      return SDValue();
    int64_t StartOffset = Offset & ~15;
    if (StartOffset)
      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));

    int EltNo = (Offset - StartOffset) >> 2;
    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,
                             LD->getPointerInfo().getWithOffset(StartOffset),
                             false, false, 0);
    // Canonicalize it to a v4i32 shuffle.
    V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1);
    return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
                                            DAG.getUNDEF(MVT::v4i32),&Mask[0]));
  }

  return SDValue();
}

/// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
/// vector of type 'VT', see if the elements can be replaced by a single large
/// load which has the same value as a build_vector whose operands are 'elts'.
///
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
///
/// FIXME: we'd also like to handle the case where the last elements are zero
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
/// There's even a handy isZeroNode for that purpose.
static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
                                        DebugLoc &DL, SelectionDAG &DAG) {
  EVT EltVT = VT.getVectorElementType();
  unsigned NumElems = Elts.size();

  LoadSDNode *LDBase = NULL;
  unsigned LastLoadedElt = -1U;

  // For each element in the initializer, see if we've found a load or an undef.
  // If we don't find an initial load element, or later load elements are
  // non-consecutive, bail out.
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Elts[i];

    if (!Elt.getNode() ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
      return SDValue();
    if (!LDBase) {
      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
        return SDValue();
      LDBase = cast<LoadSDNode>(Elt.getNode());
      LastLoadedElt = i;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    LoadSDNode *LD = cast<LoadSDNode>(Elt);
    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
      return SDValue();
    LastLoadedElt = i;
  }

  // If we have found an entire vector of loads and undefs, then return a large
  // load of the entire vector width starting at the base pointer.  If we found
  // consecutive loads for the low half, generate a vzext_load node.
  if (LastLoadedElt == NumElems - 1) {
    if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
      return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                         LDBase->getPointerInfo(),
                         LDBase->isVolatile(), LDBase->isNonTemporal(), 0);
    return DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                       LDBase->getPointerInfo(),
                       LDBase->isVolatile(), LDBase->isNonTemporal(),
                       LDBase->getAlignment());
  } else if (NumElems == 4 && LastLoadedElt == 1 &&
             DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
    SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
    SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys,
                                              Ops, 2, MVT::i32,
                                              LDBase->getMemOperand());
    return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  EVT VT = Op.getValueType();
  EVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // All zero's:
  //  - pxor (SSE2), xorps (SSE1), vpxor (128 AVX), xorp[s|d] (256 AVX)
  // All one's:
  //  - pcmpeqd (SSE2 and 128 AVX), fallback to constant pools (256 AVX)
  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
      ISD::isBuildVectorAllOnes(Op.getNode())) {
    // Canonicalize this to <4 x i32> or <8 x 32> (SSE) to
    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
    // eliminated on x86-32 hosts.
    if (Op.getValueType() == MVT::v4i32 ||
        Op.getValueType() == MVT::v8i32)
      return Op;

    if (ISD::isBuildVectorAllOnes(Op.getNode()))
      return getOnesVector(Op.getValueType(), DAG, dl);
    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
  }

  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumZero  = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      NonZeros |= (1 << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits of
    // the value are obviously zero, truncate the value to i32 and do the
    // insertion that way.  Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0.  It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        EVT VecVT = MVT::v4i32;
        unsigned VecElts = 4;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);

        // Now we have our 32-bit value zero extended in the low element of
        // a vector.  If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SmallVector<int, 4> Mask;
          Mask.push_back(Idx);
          for (unsigned i = 1; i != VecElts; ++i)
            Mask.push_back(i);
          Item = DAG.getVectorShuffle(VecVT, dl, Item,
                                      DAG.getUNDEF(Item.getValueType()),
                                      &Mask[0]);
        }
        return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Item);
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0) {
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
                                           DAG);
      } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        assert(VT.getSizeInBits() == 128 && "Expected an SSE value type!");
        EVT MiddleVT = MVT::v4i32;
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
                                           Subtarget->hasSSE2(), DAG);
        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
                                         Subtarget->hasSSE2(), DAG);
      SmallVector<int, 8> MaskVec;
      for (unsigned i = 0; i < NumElems; i++)
        MaskVec.push_back(i == Idx ? 0 : 1);
      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.getSizeInBits() == 256 && !ISD::isBuildVectorAllZeros(Op.getNode())) {
    SmallVector<SDValue, 32> V;
    for (unsigned i = 0; i < NumElems; ++i)
      V.push_back(Op.getOperand(i));

    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
                                NumElems/2);

    // Recreate the wider vector with the lower and upper part.
    SDValue Vec = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, VT), Lower,
                                DAG.getConstant(0, MVT::i32), DAG, dl);
    return Insert128BitVector(Vec, Upper, DAG.getConstant(NumElems/2, MVT::i32),
                              DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = CountTrailingZeros_32(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                                 Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true,
                                         Subtarget->hasSSE2(), DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
                                        *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
                                      *this);
    if (V.getNode()) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }

    SmallVector<int, 8> MaskVec;
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      MaskVec.push_back(Reverse ? 1-i : i);
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }

  if (Values.size() > 1 && VT.getSizeInBits() == 128) {
    // Check for a build vector of consecutive loads.
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = Op.getOperand(i);

    // Check for elements which are consecutive loads.
    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG);
    if (LD.getNode())
      return LD;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (getSubtarget()->hasSSE41()) {
      SDValue Result;
      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i));
      }
      return Result;
    }

    // Otherwise, expand into a number of unpckl*, start by extending each of
    // our (non-undef) elements to the full vector width with the element in the
    // bottom slot of the vector (which generates no code for SSE).
    for (unsigned i = 0; i < NumElems; ++i) {
      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        V[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If V[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped.  This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
            EltStride == NumElems/2)
          continue;

        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return V[0];
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
  // We support concatenate two MMX registers and place them in a MMX
  // register.  This is better than doing a stack convert.
  DebugLoc dl = Op.getDebugLoc();
  EVT ResVT = Op.getValueType();
  assert(Op.getNumOperands() == 2);
  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
  int Mask[2];
  SDValue InVec = DAG.getNode(ISD::BITCAST,dl, MVT::v1i64, Op.getOperand(0));
  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
  InVec = Op.getOperand(1);
  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
    unsigned NumElts = ResVT.getVectorNumElements();
    VecOp = DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
  } else {
    InVec = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, InVec);
    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
    Mask[0] = 0; Mask[1] = 2;
    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
  }
  return DAG.getNode(ISD::BITCAST, dl, ResVT, VecOp);
}

// v8i16 shuffles - Prefer shuffles in the following order: