X86ISelLowering.cpp

  //    shuffle nodes.
  // 3) Rewriting of unmatched masks into new generic shuffle operations,
  //    so the shuffle can be broken into other shuffles and the legalizer can
  //    try the lowering again.
  //
  // The general ideia is that no vector_shuffle operation should be left to
  // be matched during isel, all of them must be converted to a target specific
  // node here.

  // Normalize the input vectors. Here splats, zeroed vectors, profitable
  // narrowing and commutation of operands should be handled. The actual code
  // doesn't include all of those, work in progress...
  SDValue NewOp = NormalizeVectorShuffle(Op, DAG, *this, Subtarget);
  if (NewOp.getNode())
    return NewOp;

  // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and
  // unpckh_undef). Only use pshufd if speed is more important than size.
  if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp))
    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
  if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp))
    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);

  if (X86::isMOVDDUPMask(SVOp) && HasSSE3 && V2IsUndef &&
      RelaxedMayFoldVectorLoad(V1))
    return getMOVDDup(Op, dl, V1, DAG);

  if (X86::isMOVHLPS_v_undef_Mask(SVOp))
    return getMOVHighToLow(Op, dl, DAG);

  // Use to match splats
  if (HasSSE2 && X86::isUNPCKHMask(SVOp) && V2IsUndef &&
      (VT == MVT::v2f64 || VT == MVT::v2i64))
    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);

  if (X86::isPSHUFDMask(SVOp)) {
    // The actual implementation will match the mask in the if above and then
    // during isel it can match several different instructions, not only pshufd
    // as its name says, sad but true, emulate the behavior for now...
    if (X86::isMOVDDUPMask(SVOp) && ((VT == MVT::v4f32 || VT == MVT::v2i64)))
        return getTargetShuffleNode(X86ISD::MOVLHPS, dl, VT, V1, V1, DAG);

    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);

    if (HasSSE2 && (VT == MVT::v4f32 || VT == MVT::v4i32))
      return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, TargetMask, DAG);

    if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V1,
                                  TargetMask, DAG);

    if (VT == MVT::v4f32)
      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V1,
                                  TargetMask, DAG);
  }

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = getSubtarget()->hasSSE2() &&
    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(SVOp)) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!X86::isMOVLPMask(SVOp)) {
      if (HasSSE2 && (VT == MVT::v2i64 || VT == MVT::v2f64))
        return getTargetShuffleNode(X86ISD::MOVSD, dl, VT, V1, V2, DAG);

      if (VT == MVT::v4i32 || VT == MVT::v4f32)
        return getTargetShuffleNode(X86ISD::MOVSS, dl, VT, V1, V2, DAG);
    }
  }

  // FIXME: fold these into legal mask.
  if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp))
    return getMOVLowToHigh(Op, dl, DAG, HasSSE2);

  if (X86::isMOVHLPSMask(SVOp))
    return getMOVHighToLow(Op, dl, DAG);

  if (X86::isMOVSHDUPMask(SVOp, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSHDUP, dl, VT, V1, DAG);

  if (X86::isMOVSLDUPMask(SVOp, Subtarget))
    return getTargetShuffleNode(X86ISD::MOVSLDUP, dl, VT, V1, DAG);

  if (X86::isMOVLPMask(SVOp))
    return getMOVLP(Op, dl, DAG, HasSSE2);

  if (ShouldXformToMOVHLPS(SVOp) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(SVOp, DAG);
    SVOp = cast<ShuffleVectorSDNode>(Op);
    V1 = SVOp->getOperand(0);
    V2 = SVOp->getOperand(1);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (X86::isUNPCKLMask(SVOp))
    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG);

  if (X86::isUNPCKHMask(SVOp))
    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG);

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 points to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SDValue NewMask = NormalizeMask(SVOp, DAG);
    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
    if (NSVOp != SVOp) {
      if (X86::isUNPCKLMask(NSVOp, true)) {
        return NewMask;
      } else if (X86::isUNPCKHMask(NSVOp, true)) {
        return NewMask;
      }
    }
  }

  if (Commuted) {
    // Commute is back and try unpck* again.
    // FIXME: this seems wrong.
    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);

    if (X86::isUNPCKLMask(NewSVOp))
      return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG);

    if (X86::isUNPCKHMask(NewSVOp))
      return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG);
  }

  // Normalize the node to match x86 shuffle ops if needed
  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  // The checks below are all present in isShuffleMaskLegal, but they are
  // inlined here right now to enable us to directly emit target specific
  // nodes, and remove one by one until they don't return Op anymore.
  SmallVector<int, 16> M;
  SVOp->getMask(M);

  if (isPALIGNRMask(M, VT, HasSSSE3))
    return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
                                X86::getShufflePALIGNRImmediate(SVOp),
                                DAG);

  if (ShuffleVectorSDNode::isSplatMask(&M[0], VT) &&
      SVOp->getSplatIndex() == 0 && V2IsUndef) {
    if (VT == MVT::v2f64)
      return getTargetShuffleNode(X86ISD::UNPCKLPD, dl, VT, V1, V1, DAG);
    if (VT == MVT::v2i64)
      return getTargetShuffleNode(X86ISD::PUNPCKLQDQ, dl, VT, V1, V1, DAG);
  }

  if (isPSHUFHWMask(M, VT))
    return getTargetShuffleNode(X86ISD::PSHUFHW, dl, VT, V1,
                                X86::getShufflePSHUFHWImmediate(SVOp),
                                DAG);

  if (isPSHUFLWMask(M, VT))
    return getTargetShuffleNode(X86ISD::PSHUFLW, dl, VT, V1,
                                X86::getShufflePSHUFLWImmediate(SVOp),
                                DAG);

  if (isSHUFPMask(M, VT)) {
    unsigned TargetMask = X86::getShuffleSHUFImmediate(SVOp);
    if (VT == MVT::v4f32 || VT == MVT::v4i32)
      return getTargetShuffleNode(X86ISD::SHUFPS, dl, VT, V1, V2,
                                  TargetMask, DAG);
    if (VT == MVT::v2f64 || VT == MVT::v2i64)
      return getTargetShuffleNode(X86ISD::SHUFPD, dl, VT, V1, V2,
                                  TargetMask, DAG);
  }

  if (X86::isUNPCKL_v_undef_Mask(SVOp))
    return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG);
  if (X86::isUNPCKH_v_undef_Mask(SVOp))
    return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG);

  //===--------------------------------------------------------------------===//
  // Generate target specific nodes for 128 or 256-bit shuffles only
  // supported in the AVX instruction set.
  //

  // Handle VPERMILPS* permutations
  if (isVPERMILPSMask(M, VT, Subtarget))
    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
                                getShuffleVPERMILPSImmediate(SVOp), DAG);

  // Handle VPERMILPD* permutations
  if (isVPERMILPDMask(M, VT, Subtarget))
    return getTargetShuffleNode(getVPERMILOpcode(VT), dl, VT, V1,
                                getShuffleVPERMILPDImmediate(SVOp), DAG);

  //===--------------------------------------------------------------------===//
  // Since no target specific shuffle was selected for this generic one,
  // lower it into other known shuffles. FIXME: this isn't true yet, but
  // this is the plan.
  //

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 128-bit wide vectors with 4 elements, and match them with
  // several different shuffle types.
  if (NumElems == 4 && VT.getSizeInBits() == 128)
    return LowerVECTOR_SHUFFLE_128v4(SVOp, DAG);

  // Handle general 256-bit shuffles
  if (VT.is256BitVector())
    return LowerVECTOR_SHUFFLE_256(SVOp, DAG);

  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  if (Op.getOperand(0).getValueType().getSizeInBits() != 128)
    return SDValue();

  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BITCAST, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32.  And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BITCAST ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BITCAST, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                              Op.getOperand(1));
    return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Extract);
  } else if (VT == MVT::i32) {
    // ExtractPS works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}


SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDValue();

  SDValue Vec = Op.getOperand(0);
  EVT VecVT = Vec.getValueType();

  // If this is a 256-bit vector result, first extract the 128-bit vector and
  // then extract the element from the 128-bit vector.
  if (VecVT.getSizeInBits() == 256) {
    DebugLoc dl = Op.getNode()->getDebugLoc();
    unsigned NumElems = VecVT.getVectorNumElements();
    SDValue Idx = Op.getOperand(1);
    unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

    // Get the 128-bit vector.
    bool Upper = IdxVal >= NumElems/2;
    Vec = Extract128BitVector(Vec,
                    DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32), DAG, dl);

    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
                    Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : Idx);
  }

  assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length");

  if (Subtarget->hasSSE41() || Subtarget->hasAVX()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BITCAST, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it match pextrw which produces a 32-bit result.
    EVT EltVT = MVT::i32;
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
    EVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  } else if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    EVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
                                               SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (VT.getSizeInBits() == 256)
    return SDValue();

  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc;
    if (VT == MVT::v8i16)
      Opc = X86ISD::PINSRW;
    else if (VT == MVT::v16i8)
      Opc = X86ISD::PINSRB;
    else
      Opc = X86ISD::PINSRB;

    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
    // argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select.  This will always be
    //  zero here.  The DAG Combiner may combine an extract_elt index into these
    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select.  This is the
    //  value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
    //   combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar to vector..
    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
    // PINSR* works with constant index.
    return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  // If this is a 256-bit vector result, first extract the 128-bit vector,
  // insert the element into the extracted half and then place it back.
  if (VT.getSizeInBits() == 256) {
    if (!isa<ConstantSDNode>(N2))
      return SDValue();

    // Get the desired 128-bit vector half.
    unsigned NumElems = VT.getVectorNumElements();
    unsigned IdxVal = cast<ConstantSDNode>(N2)->getZExtValue();
    bool Upper = IdxVal >= NumElems/2;
    SDValue Ins128Idx = DAG.getConstant(Upper ? NumElems/2 : 0, MVT::i32);
    SDValue V = Extract128BitVector(N0, Ins128Idx, DAG, dl);

    // Insert the element into the desired half.
    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V,
                 N1, Upper ? DAG.getConstant(IdxVal-NumElems/2, MVT::i32) : N2);

    // Insert the changed part back to the 256-bit vector
    return Insert128BitVector(N0, V, Ins128Idx, DAG, dl);
  }

  if (Subtarget->hasSSE41() || Subtarget->hasAVX())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EltVT == MVT::i8)
    return SDValue();

  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
    // as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT OpVT = Op.getValueType();

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (OpVT.getSizeInBits() > 128) {
    // Insert into a 128-bit vector.
    EVT VT128 = EVT::getVectorVT(*Context,
                                 OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / 2);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, OpVT), Op,
                              DAG.getConstant(0, MVT::i32),
                              DAG, dl);
  }

  if (Op.getValueType() == MVT::v1i64 &&
      Op.getOperand(0).getValueType() == MVT::i64)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  assert(Op.getValueType().getSimpleVT().getSizeInBits() == 128 &&
         "Expected an SSE type!");
  return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,AnyExt));
}

// Lower a node with an EXTRACT_SUBVECTOR opcode.  This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
SDValue
X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
  if (Subtarget->hasAVX()) {
    DebugLoc dl = Op.getNode()->getDebugLoc();
    SDValue Vec = Op.getNode()->getOperand(0);
    SDValue Idx = Op.getNode()->getOperand(1);

    if (Op.getNode()->getValueType(0).getSizeInBits() == 128
        && Vec.getNode()->getValueType(0).getSizeInBits() == 256) {
        return Extract128BitVector(Vec, Idx, DAG, dl);
    }
  }
  return SDValue();
}

// Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
SDValue
X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
  if (Subtarget->hasAVX()) {
    DebugLoc dl = Op.getNode()->getDebugLoc();
    SDValue Vec = Op.getNode()->getOperand(0);
    SDValue SubVec = Op.getNode()->getOperand(1);
    SDValue Idx = Op.getNode()->getOperand(2);

    if (Op.getNode()->getValueType(0).getSizeInBits() == 256
        && SubVec.getNode()->getValueType(0).getSizeInBits() == 128) {
      return Insert128BitVector(Vec, SubVec, Idx, DAG, dl);
    }
  }
  return SDValue();
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  DebugLoc DL = CP->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
                                          OpFlag);
  DebugLoc DL = JT->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);

  DebugLoc DL = Op.getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);


  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->is64Bit()) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddressAddress node.
  unsigned char OpFlags =
    Subtarget->ClassifyBlockAddressReference();
  CodeModel::Model M = getTargetMachine().getCodeModel();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  DebugLoc dl = Op.getDebugLoc();
  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
                                       /*isTarget=*/true, OpFlags);

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset,
                                      SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags =
    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  CodeModel::Model M = getTargetMachine().getCodeModel();
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(), 0, OpFlags);
  }

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(), false, false, 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  DebugLoc dl = GA->getDebugLoc();
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);
  if (InFlag) {
    SDValue Ops[] = { Chain,  TGA, *InFlag };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
  } else {
    SDValue Ops[]  = { Chain, TGA };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI->setAdjustsStack(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                     DAG.getNode(X86ISD::GlobalBaseReg,
                                                 DebugLoc(), PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit) {
  DebugLoc dl = GA->getDebugLoc();

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                                      DAG.getIntPtrConstant(0),
                                      MachinePointerInfo(Ptr), false, false, 0);

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (is64Bit) {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_GOTTPOFF;
    WrapperKind = X86ISD::WrapperRIP;
  } else {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_INDNTPOFF;
  }

  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
  // exec)
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec)
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(), false, false, 0);

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  const GlobalValue *GV = GA->getGlobal();

  if (Subtarget->isTargetELF()) {
    // TODO: implement the "local dynamic" model
    // TODO: implement the "initial exec"model for pic executables

    // If GV is an alias then use the aliasee for determining
    // thread-localness.
    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
      GV = GA->resolveAliasedGlobal(false);

    TLSModel::Model model
      = getTLSModel(GV, getTargetMachine().getRelocationModel());

    switch (model) {
      case TLSModel::GeneralDynamic:
      case TLSModel::LocalDynamic: // not implemented
        if (Subtarget->is64Bit())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
        return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());

      case TLSModel::InitialExec:
      case TLSModel::LocalExec:
        return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
                                   Subtarget->is64Bit());
    }
  } else if (Subtarget->isTargetDarwin()) {
    // Darwin only has one model of TLS.  Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget->isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = (getTargetMachine().getRelocationModel() == Reloc::PIC_) &&
                  !Subtarget->is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    DebugLoc DL = Op.getDebugLoc();
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                           DAG.getNode(X86ISD::GlobalBaseReg,
                                       DebugLoc(), getPointerTy()),
                           Offset);

    // Lowering the machine isd will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    MFI->setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, getPointerTy());
  }

  assert(false &&
         "TLS not implemented for this target.");

  llvm_unreachable("Unreachable");
  return SDValue();
}


/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
/// take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  EVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                                     DAG.getConstant(VTBits - 1, MVT::i8))
                       : DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
  }

  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                             AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  EVT SrcVT = Op.getOperand(0).getValueType();

  if (SrcVT.isVector())
    return SDValue();