X86ISelLowering.cpp

    } else {
      Mask1[0] = DAG.getConstant(HiIndex & 1 ? 2 : 0, MaskEVT);
      Mask1[1] = DAG.getConstant(HiIndex & 1 ? 0 : 2, MaskEVT);
      Mask1[2] = PermMask.getOperand(2);
      Mask1[3] = PermMask.getOperand(3);
      if (Mask1[2].getOpcode() != ISD::UNDEF)
        Mask1[2] =
          DAG.getConstant(cast<ConstantSDNode>(Mask1[2])->getZExtValue()+4,
                          MaskEVT);
      if (Mask1[3].getOpcode() != ISD::UNDEF)
        Mask1[3] =
          DAG.getConstant(cast<ConstantSDNode>(Mask1[3])->getZExtValue()+4,
                          MaskEVT);
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V2, V1,
                         DAG.getBUILD_VECTOR(MaskVT, dl, &Mask1[0], 4));
    }
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  Locs.clear();
  SmallVector<SDValue,8> LoMask(4, DAG.getUNDEF(MaskEVT));
  SmallVector<SDValue,8> HiMask(4, DAG.getUNDEF(MaskEVT));
  SmallVector<SDValue,8> *MaskPtr = &LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = &HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    SDValue Elt = PermMask.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (cast<ConstantSDNode>(Elt)->getZExtValue() < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      (*MaskPtr)[LoIdx] = Elt;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      (*MaskPtr)[HiIdx] = Elt;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                                  DAG.getBUILD_VECTOR(MaskVT, dl,
                                                &LoMask[0], LoMask.size()));
  SDValue HiShuffle = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2,
                                  DAG.getBUILD_VECTOR(MaskVT, dl,
                                                &HiMask[0], HiMask.size()));
  SmallVector<SDValue, 8> MaskOps;
  for (unsigned i = 0; i != 4; ++i) {
    if (Locs[i].first == -1) {
      MaskOps.push_back(DAG.getUNDEF(MaskEVT));
    } else {
      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
      MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
    }
  }
  return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, LoShuffle, HiShuffle,
                     DAG.getBUILD_VECTOR(MaskVT, dl, &MaskOps[0], MaskOps.size()));
}

SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  SDValue PermMask = Op.getOperand(2);
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = PermMask.getNumOperands();
  bool isMMX = VT.getSizeInBits() == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;

  if (isUndefShuffle(Op.getNode()))
    return DAG.getUNDEF(VT);

  if (isZeroShuffle(Op.getNode()))
    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);

  if (isIdentityMask(PermMask.getNode()))
    return V1;
  else if (isIdentityMask(PermMask.getNode(), true))
    return V2;

  // Canonicalize movddup shuffles.
  if (V2IsUndef && Subtarget->hasSSE2() &&
      VT.getSizeInBits() == 128 &&
      X86::isMOVDDUPMask(PermMask.getNode()))
    return CanonicalizeMovddup(Op, V1, PermMask, DAG, Subtarget->hasSSE3());

  if (isSplatMask(PermMask.getNode())) {
    if (isMMX || NumElems < 4) return Op;
    // Promote it to a v4{if}32 splat.
    return PromoteSplat(Op, DAG, Subtarget->hasSSE2());
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG,
                                            *this, dl);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                         LowerVECTOR_SHUFFLE(NewOp, DAG));
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
                                                 DAG, *this, dl);
      if (NewOp.getNode()) {
        SDValue NewV1 = NewOp.getOperand(0);
        SDValue NewV2 = NewOp.getOperand(1);
        SDValue NewMask = NewOp.getOperand(2);
        if (isCommutedMOVL(NewMask.getNode(), true, false)) {
          NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
          return getVZextMovL(VT, NewOp.getValueType(), NewV2, DAG, Subtarget,
                              dl);
        }
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask,
                                                DAG, *this, dl);
      if (NewOp.getNode() && X86::isMOVLMask(NewOp.getOperand(2).getNode()))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                             DAG, Subtarget, dl);
    }
  }

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = isVectorShift(Op, PermMask, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    MVT EVT = VT.getVectorElementType();
    ShAmt *= EVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(PermMask.getNode())) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMMX)
      return Op;
  }

  if (!isMMX && (X86::isMOVSHDUPMask(PermMask.getNode()) ||
                 X86::isMOVSLDUPMask(PermMask.getNode()) ||
                 X86::isMOVHLPSMask(PermMask.getNode()) ||
                 X86::isMOVHPMask(PermMask.getNode()) ||
                 X86::isMOVLPMask(PermMask.getNode())))
    return Op;

  if (ShouldXformToMOVHLPS(PermMask.getNode()) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), PermMask.getNode()))
    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    MVT EVT = VT.getVectorElementType();
    ShAmt *= EVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  // FIXME: Figure out a cleaner way to do this.
  if (isCommutedMOVL(PermMask.getNode(), V2IsSplat, V2IsUndef)) {
    if (V2IsUndef) return V1;
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (V2IsSplat) {
      // V2 is a splat, so the mask may be malformed. That is, it may point
      // to any V2 element. The instruction selectior won't like this. Get
      // a corrected mask and commute to form a proper MOVS{S|D}.
      SDValue NewMask = getMOVLMask(NumElems, DAG, dl);
      if (NewMask.getNode() != PermMask.getNode())
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
    }
    return Op;
  }

  if (X86::isUNPCKL_v_undef_Mask(PermMask.getNode()) ||
      X86::isUNPCKH_v_undef_Mask(PermMask.getNode()) ||
      X86::isUNPCKLMask(PermMask.getNode()) ||
      X86::isUNPCKHMask(PermMask.getNode()))
    return Op;

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 points to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SDValue NewMask = NormalizeMask(PermMask, DAG);
    if (NewMask.getNode() != PermMask.getNode()) {
      if (X86::isUNPCKLMask(NewMask.getNode(), true)) {
        SDValue NewMask = getUnpacklMask(NumElems, DAG, dl);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
      } else if (X86::isUNPCKHMask(NewMask.getNode(), true)) {
        SDValue NewMask = getUnpackhMask(NumElems, DAG, dl);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1, V2, NewMask);
      }
    }
  }

  // Normalize the node to match x86 shuffle ops if needed
  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.getNode()))
      Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  if (Commuted) {
    // Commute is back and try unpck* again.
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (X86::isUNPCKL_v_undef_Mask(PermMask.getNode()) ||
        X86::isUNPCKH_v_undef_Mask(PermMask.getNode()) ||
        X86::isUNPCKLMask(PermMask.getNode()) ||
        X86::isUNPCKHMask(PermMask.getNode()))
      return Op;
  }

  // Try PSHUF* first, then SHUFP*.
  // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically
  // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
  if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.getNode())) {
    if (V2.getOpcode() != ISD::UNDEF)
      return DAG.getNode(ISD::VECTOR_SHUFFLE, dl, VT, V1,
                         DAG.getUNDEF(VT), PermMask);
    return Op;
  }

  if (!isMMX) {
    if (Subtarget->hasSSE2() &&
        (X86::isPSHUFDMask(PermMask.getNode()) ||
         X86::isPSHUFHWMask(PermMask.getNode()) ||
         X86::isPSHUFLWMask(PermMask.getNode()))) {
      MVT RVT = VT;
      if (VT == MVT::v4f32) {
        RVT = MVT::v4i32;
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, RVT,
                         DAG.getNode(ISD::BIT_CONVERT, dl, RVT, V1),
                         DAG.getUNDEF(RVT), PermMask);
      } else if (V2.getOpcode() != ISD::UNDEF)
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, RVT, V1,
                         DAG.getUNDEF(RVT), PermMask);
      if (RVT != VT)
        Op = DAG.getNode(ISD::BIT_CONVERT, dl, VT, Op);
      return Op;
    }

    // Binary or unary shufps.
    if (X86::isSHUFPMask(PermMask.getNode()) ||
        (V2.getOpcode() == ISD::UNDEF && X86::isPSHUFDMask(PermMask.getNode())))
      return Op;
  }

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this, dl);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 4 wide cases with a number of shuffles except for MMX.
  if (NumElems == 4 && !isMMX)
    return LowerVECTOR_SHUFFLE_4wide(V1, V2, PermMask, VT, DAG, dl);

  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32.  And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BIT_CONVERT ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                              Op.getOperand(1));
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
  } else if (VT == MVT::i32) {
    // ExtractPS works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}


SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDValue();

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  MVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it match pextrw which produces a 32-bit result.
    MVT EVT = (MVT::SimpleValueType)(VT.getSimpleVT()+1);
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EVT,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EVT, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;
    // SHUFPS the element to the lowest double word, then movss.
    MVT MaskVT = MVT::getIntVectorWithNumElements(4);
    SmallVector<SDValue, 8> IdxVec;
    IdxVec.
      push_back(DAG.getConstant(Idx, MaskVT.getVectorElementType()));
    IdxVec.
      push_back(DAG.getUNDEF(MaskVT.getVectorElementType()));
    IdxVec.
      push_back(DAG.getUNDEF(MaskVT.getVectorElementType()));
    IdxVec.
      push_back(DAG.getUNDEF(MaskVT.getVectorElementType()));
    SDValue Mask = DAG.getBUILD_VECTOR(MaskVT, dl, &IdxVec[0], IdxVec.size());
    SDValue Vec = Op.getOperand(0);
    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, Vec.getValueType(),
                      Vec, DAG.getUNDEF(Vec.getValueType()), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  } else if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    MVT MaskVT = MVT::getIntVectorWithNumElements(2);
    SmallVector<SDValue, 8> IdxVec;
    IdxVec.push_back(DAG.getConstant(1, MaskVT.getVectorElementType()));
    IdxVec.
      push_back(DAG.getUNDEF(MaskVT.getVectorElementType()));
    SDValue Mask = DAG.getBUILD_VECTOR(MaskVT, dl, &IdxVec[0], IdxVec.size());
    SDValue Vec = Op.getOperand(0);
    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, Vec.getValueType(),
                      Vec, DAG.getUNDEF(Vec.getValueType()),
                      Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if ((EVT.getSizeInBits() == 8 || EVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc = (EVT.getSizeInBits() == 8) ? X86ISD::PINSRB
                                                  : X86ISD::PINSRW;
    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
    // argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select.  This will always be
    //  zero here.  The DAG Combiner may combine an extract_elt index into these
    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select.  This is the
    //  value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
    //   combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EVT == MVT::i32) {
    // InsertPS works with constant index.
    if (isa<ConstantSDNode>(N2))
      return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();
  MVT EVT = VT.getVectorElementType();

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EVT == MVT::i8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (EVT.getSizeInBits() == 16) {
    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
    // as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  if (Op.getValueType() == MVT::v2f32)
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
                                               Op.getOperand(0))));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  MVT VT = MVT::v2i32;
  switch (Op.getValueType().getSimpleVT()) {
  default: break;
  case MVT::v16i8:
  case MVT::v8i16:
    VT = MVT::v4i32;
    break;
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  // FIXME there isn't really any debug info here, should come from the parent
  DebugLoc dl = CP->getDebugLoc();
  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(),
                                               getPointerTy(),
                                               CP->getAlignment());
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset,
                                      SelectionDAG &DAG) const {
  bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
  bool ExtraLoadRequired =
    Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false);

  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  SDValue Result;
  if (!IsPic && !ExtraLoadRequired && isInt32(Offset)) {
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
    Offset = 0;
  } else
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0);
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (IsPic && !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For Darwin & Mingw32, external and weak symbols are indirect, so we want to
  // load the value at address GV, not the value of GV itself. This means that
  // the GlobalAddress must be in the base or index register of the address, not
  // the GV offset field. Platform check is inside GVRequiresExtraLoad() call
  // The same applies for external symbols during PIC codegen
  if (ExtraLoadRequired)
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         PseudoSourceValue::getGOT(), 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const MVT PtrVT) {
  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                     DAG.getNode(X86ISD::GlobalBaseReg,
                                                 DebugLoc::getUnknownLoc(),
                                                 PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  // emit leal symbol@TLSGD(,%ebx,1), %eax
  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                             GA->getValueType(0),
                                             GA->getOffset());
  SDValue Ops[] = { Chain,  TGA, InFlag };
  SDValue Result = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
  InFlag = Result.getValue(2);
  Chain = Result.getValue(1);

  // call ___tls_get_addr. This function receives its argument in
  // the register EAX.
  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Result, InFlag);
  InFlag = Chain.getValue(1);

  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops1[] = { Chain,
                      DAG.getTargetExternalSymbol("___tls_get_addr",
                                                  PtrVT),
                      DAG.getRegister(X86::EAX, PtrVT),
                      DAG.getRegister(X86::EBX, PtrVT),
                      InFlag };
  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops1, 5);
  InFlag = Chain.getValue(1);

  return DAG.getCopyFromReg(Chain, dl, X86::EAX, PtrVT, InFlag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const MVT PtrVT) {
  SDValue InFlag, Chain;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better

  // emit leaq symbol@TLSGD(%rip), %rdi
  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                             GA->getValueType(0),
                                             GA->getOffset());
  SDValue Ops[]  = { DAG.getEntryNode(), TGA};
  SDValue Result = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
  Chain  = Result.getValue(1);
  InFlag = Result.getValue(2);

  // call __tls_get_addr. This function receives its argument in
  // the register RDI.
  Chain = DAG.getCopyToReg(Chain, dl, X86::RDI, Result, InFlag);
  InFlag = Chain.getValue(1);

  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDValue Ops1[] = { Chain,
                      DAG.getTargetExternalSymbol("__tls_get_addr",
                                                  PtrVT),
                      DAG.getRegister(X86::RDI, PtrVT),
                      InFlag };
  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops1, 4);
  InFlag = Chain.getValue(1);

  return DAG.getCopyFromReg(Chain, dl, X86::RAX, PtrVT, InFlag);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                     const MVT PtrVT) {
  DebugLoc dl = GA->getDebugLoc();
  // Get the Thread Pointer
  SDValue ThreadPointer = DAG.getNode(X86ISD::THREAD_POINTER,
                                      DebugLoc::getUnknownLoc(), PtrVT);
  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
  // exec)
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                             GA->getValueType(0),
                                             GA->getOffset());
  SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

  if (GA->getGlobal()->isDeclaration()) // initial exec TLS model
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         PseudoSourceValue::getGOT(), 0);

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
  // TODO: implement the "local dynamic" model
  // TODO: implement the "initial exec"model for pic executables
  assert(Subtarget->isTargetELF() &&
         "TLS not implemented for non-ELF targets");
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
  // If the relocation model is PIC, use the "General Dynamic" TLS Model,
  // otherwise use the "Local Exec"TLS Model
  if (Subtarget->is64Bit()) {
    return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
  } else {
    if (getTargetMachine().getRelocationModel() == Reloc::PIC_)
      return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
    else
      return LowerToTLSExecModel(GA, DAG, getPointerTy());
  }
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
  // FIXME there isn't really any debug info here
  DebugLoc dl = Op.getDebugLoc();
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  // FIXME there isn't really any debug into here
  DebugLoc dl = JT->getDebugLoc();
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
  Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
/// take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
  MVT VT = Op.getValueType();
  unsigned VTBits = VT.getSizeInBits();
  DebugLoc dl = Op.getDebugLoc();
  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDValue ShOpLo = Op.getOperand(0);
  SDValue ShOpHi = Op.getOperand(1);
  SDValue ShAmt  = Op.getOperand(2);
  SDValue Tmp1 = isSRA ?
    DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
                DAG.getConstant(VTBits - 1, MVT::i8)) :
    DAG.getConstant(0, VT);

  SDValue Tmp2, Tmp3;
  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
  }

  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
                                  DAG.getConstant(VTBits, MVT::i8));
  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
                               AndNode, DAG.getConstant(0, MVT::i8));

  SDValue Hi, Lo;
  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };

  if (Op.getOpcode() == ISD::SHL_PARTS) {
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  } else {
    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
  }

  SDValue Ops[2] = { Lo, Hi };
  return DAG.getMergeValues(Ops, 2, dl);
}

SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getValueType();
  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; caller falls through into that case.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return SDValue();
  if (SrcVT == MVT::i64 && Op.getValueType() != MVT::f80 &&
      Subtarget->is64Bit())
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                 StackSlot,
                                 PseudoSourceValue::getFixedStack(SSFI), 0);

  // Build the FILD
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
  SmallVector<SDValue, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(StackSlot);
  Ops.push_back(DAG.getValueType(SrcVT));
  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
                                 Tys, &Ops[0], Ops.size());

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SmallVector<SDValue, 8> Ops;
    Ops.push_back(Chain);
    Ops.push_back(Result);
    Ops.push_back(StackSlot);
    Ops.push_back(DAG.getValueType(Op.getValueType()));
    Ops.push_back(InFlag);
    Chain = DAG.getNode(X86ISD::FST, dl, Tys, &Ops[0], Ops.size());
    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
                         PseudoSourceValue::getFixedStack(SSFI), 0);
  }

  return Result;
}

// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
  // This algorithm is not obvious. Here it is in C code, more or less:
  /*
    double uint64_to_double( uint32_t hi, uint32_t lo ) {
      static const __m128i exp = { 0x4330000045300000ULL, 0 };
      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };

      // Copy ints to xmm registers.
      __m128i xh = _mm_cvtsi32_si128( hi );
      __m128i xl = _mm_cvtsi32_si128( lo );

      // Combine into low half of a single xmm register.
      __m128i x = _mm_unpacklo_epi32( xh, xl );
      __m128d d;
      double sd;

      // Merge in appropriate exponents to give the integer bits the right
      // magnitude.
      x = _mm_unpacklo_epi32( x, exp );

      // Subtract away the biases to deal with the IEEE-754 double precision
      // implicit 1.
      d = _mm_sub_pd( (__m128d) x, bias );

      // All conversions up to here are exact. The correctly rounded result is
      // calculated using the current rounding mode using the following
      // horizontal add.
      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
                                // store doesn't really need to be here (except
                                // maybe to zero the other double)
      return sd;
    }
  */

  DebugLoc dl = Op.getDebugLoc();

  // Build some magic constants.
  std::vector<Constant*> CV0;
  CV0.push_back(ConstantInt::get(APInt(32, 0x45300000)));
  CV0.push_back(ConstantInt::get(APInt(32, 0x43300000)));
  CV0.push_back(ConstantInt::get(APInt(32, 0)));
  CV0.push_back(ConstantInt::get(APInt(32, 0)));
  Constant *C0 = ConstantVector::get(CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 4);

  std::vector<Constant*> CV1;
  CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4530000000000000ULL))));
  CV1.push_back(ConstantFP::get(APFloat(APInt(64, 0x4330000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 4);

  SmallVector<SDValue, 4> MaskVec;
  MaskVec.push_back(DAG.getConstant(0, MVT::i32));
  MaskVec.push_back(DAG.getConstant(4, MVT::i32));
  MaskVec.push_back(DAG.getConstant(1, MVT::i32));
  MaskVec.push_back(DAG.getConstant(5, MVT::i32));
  SDValue UnpcklMask = DAG.getBUILD_VECTOR(MVT::v4i32, dl,
                                           &MaskVec[0], MaskVec.size());
  SmallVector<SDValue, 4> MaskVec2;
  MaskVec2.push_back(DAG.getConstant(1, MVT::i32));
  MaskVec2.push_back(DAG.getConstant(0, MVT::i32));
  SDValue ShufMask = DAG.getBUILD_VECTOR(MVT::v2i32, dl,
                                         &MaskVec2[0], MaskVec2.size());

  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(1)));
  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(0)));
  SDValue Unpck1 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v4i32,
                                XR1, XR2, UnpcklMask);
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue Unpck2 = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v4i32,
                               Unpck1, CLod0, UnpcklMask);
  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              PseudoSourceValue::getConstantPool(), 0,
                              false, 16);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  // Add the halves; easiest way is to swap them into another reg first.
  SDValue Shuf = DAG.getNode(ISD::VECTOR_SHUFFLE, dl, MVT::v2f64,
                             Sub, Sub, ShufMask);
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                         Op.getOperand(0),
                                         DAG.getIntPtrConstant(0)));

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  MVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64)) {
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  } else if (DestVT.bitsGT(MVT::f64)) {
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  }

  // Handle final rounding.
  return Sub;
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();