X86ISelLowering.cpp

    }

    if ((i & 1) != 0) {
      SDOperand ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      if (ThisElt.Val)
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, V, ThisElt,
                        DAG.getConstant(i/2, TLI.getPointerTy()));
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, MVT::v16i8, V);
}

/// LowerBuildVectorv8i16 - Custom lower a v8i16 build_vector as a chain of
/// INSERT_VECTOR_ELT nodes, one per non-zero operand. Bit i of NonZeros marks
/// operand i as non-zero. Gives up (returns an empty SDOperand) when more than
/// four operands are non-zero.
static SDOperand LowerBuildVectorv8i16(SDOperand Op, unsigned NonZeros,
                                       unsigned NumNonZero, unsigned NumZero,
                                       SelectionDAG &DAG, TargetLowering &TLI) {
  if (NumNonZero > 4)
    return SDOperand();

  SDOperand Result(0, 0);
  bool HaveBase = false;
  for (unsigned Idx = 0; Idx != 8; ++Idx) {
    // Skip operands that are not flagged as non-zero.
    if ((NonZeros & (1 << Idx)) == 0)
      continue;

    // The base vector is created lazily on the first insertion: a zero vector
    // if any operand is zero, otherwise UNDEF.
    if (!HaveBase) {
      Result = NumZero ? getZeroVector(MVT::v8i16, DAG)
                       : DAG.getNode(ISD::UNDEF, MVT::v8i16);
      HaveBase = true;
    }

    SDOperand Position = DAG.getConstant(Idx, TLI.getPointerTy());
    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Result,
                         Op.getOperand(Idx), Position);
  }

  return Result;
}

/// LowerBUILD_VECTOR - Custom lower a BUILD_VECTOR node. Classifies the
/// operands (undef / zero / non-zero / non-zero immediate) and then tries a
/// sequence of special cases. Returns Op itself for all-zeros / all-ones
/// vectors, a replacement node when one of the special cases applies, and an
/// empty SDOperand when no custom lowering is produced here.
SDOperand
X86TargetLowering::LowerBUILD_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  // All zero's are handled with pxor.
  if (ISD::isBuildVectorAllZeros(Op.Val))
    return Op;

  // All one's are handled with pcmpeqd.
  if (ISD::isBuildVectorAllOnes(Op.Val))
    return Op;

  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType EVT = MVT::getVectorElementType(VT);
  unsigned EVTBits = MVT::getSizeInBits(EVT);

  // Classify every operand:
  //   NumZero        - count of non-undef operands that are zero
  //   NumNonZero     - count of non-undef operands that are not zero
  //   NonZeros       - bit i is set when operand i is non-undef and non-zero
  //   NumNonZeroImms - count of non-zero operands that are Constant/ConstantFP
  //   Values         - set of distinct non-undef operands
  unsigned NumElems = Op.getNumOperands();
  unsigned NumZero  = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  unsigned NumNonZeroImms = 0;
  std::set<SDOperand> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDOperand Elt = Op.getOperand(i);
    if (Elt.getOpcode() != ISD::UNDEF) {
      Values.insert(Elt);
      if (isZeroNode(Elt))
        NumZero++;
      else {
        NonZeros |= (1 << i);
        NumNonZero++;
        if (Elt.getOpcode() == ISD::Constant ||
            Elt.getOpcode() == ISD::ConstantFP)
          NumNonZeroImms++;
      }
    }
  }

  if (NumNonZero == 0) {
    if (NumZero == 0)
      // All undef vector. Return an UNDEF.
      return DAG.getNode(ISD::UNDEF, VT);
    else
      // A mix of zero and undef. Return a zero vector.
      return getZeroVector(VT, DAG);
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1)
    return SDOperand();

  // Special case for single non-zero element.
  if (NumNonZero == 1) {
    // Index of the only set bit in NonZeros, i.e. the non-zero operand.
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDOperand Item = Op.getOperand(Idx);
    Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
    if (Idx == 0)
      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
      return getShuffleVectorZeroOrUndef(Item, VT, NumElems, Idx,
                                         NumZero > 0, DAG);

    if (EVTBits == 32) {
      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, VT, NumElems, 0, NumZero > 0,
                                         DAG);
      MVT::ValueType MaskVT  = MVT::getIntVectorWithNumElements(NumElems);
      MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
      // Mask selects element 0 of Item for position Idx and element 1 for
      // every other position.
      SmallVector<SDOperand, 8> MaskVec;
      for (unsigned i = 0; i < NumElems; i++)
        MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                   &MaskVec[0], MaskVec.size());
      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item,
                         DAG.getNode(ISD::UNDEF, VT), Mask);
    }
    // Single non-zero element at Idx != 0 with a non-32-bit element type
    // falls through to the generic cases below.
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (NumNonZero == NumNonZeroImms)
    return SDOperand();

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64)
    return SDOperand();

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  // Each helper may decline by returning an empty SDOperand, in which case
  // lowering continues with the cases below.
  if (EVTBits == 8 && NumElems == 16) {
    SDOperand V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
                                        *this);
    if (V.Val) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDOperand V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
                                        *this);
    if (V.Val) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDOperand, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    // Start with one vector per element: a zero vector for zero/undef
    // positions, a SCALAR_TO_VECTOR of the operand otherwise.
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, DAG);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
    }

    // Combine each pair (elements 2*i and 2*i+1) into V[i]. The switch value
    // is the two-bit slice of NonZeros for that pair.
    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2],
                             getMOVLMask(NumElems, DAG));
          break;
        case 2:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
                             getMOVLMask(NumElems, DAG));
          break;
        case 3:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
                             getUnpacklMask(NumElems, DAG));
          break;
      }
    }

    // Take advantage of the fact GR32 to VR128 scalar_to_vector (i.e. movd)
    // clears the upper bits.
    // FIXME: we can do the same for v4f32 case when we know both parts of
    // the lower half come from scalar_to_vector (loadf32). We should do
    // that in post legalizer dag combiner with target specific hooks.
    if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
      return V[0];
    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
    // NOTE: this declaration shadows the outer EVT; from here to the end of
    // this block EVT is the mask element type, not the vector element type.
    MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
    SmallVector<SDOperand, 8> MaskVec;
    // Low two mask entries index V[0]; they are swapped when only the odd
    // element of the low pair is non-zero.
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i, EVT));
    // High two mask entries index V[1] (offset by NumElems), with the same
    // swap rule applied to the high pair.
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
    SDOperand ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                     &MaskVec[0], MaskVec.size());
    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask);
  }

  if (Values.size() > 2) {
    // Expand into a number of unpckl*.
    // e.g. for v4f32
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    SDOperand UnpckMask = getUnpacklMask(NumElems, DAG);
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
    // NumElems is reused as the current stride and halved each round until
    // everything has been merged into V[0].
    NumElems >>= 1;
    while (NumElems != 0) {
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems],
                           UnpckMask);
      NumElems >>= 1;
    }
    return V[0];
  }

  return SDOperand();
}

/// LowerVECTOR_SHUFFLE - Custom lower a VECTOR_SHUFFLE node (operands: V1, V2,
/// permutation mask). Trivial shuffles (undef, zero, identity) are folded
/// away; masks that already match an x86 shuffle form are returned as-is
/// (possibly after commuting the operands or normalizing the mask); remaining
/// 4-element, non-64-bit shuffles are broken into two or three simpler
/// shuffles. Returns an empty SDOperand when none of the cases apply.
SDOperand
X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
  SDOperand V1 = Op.getOperand(0);
  SDOperand V2 = Op.getOperand(1);
  SDOperand PermMask = Op.getOperand(2);
  MVT::ValueType VT = Op.getValueType();
  unsigned NumElems = PermMask.getNumOperands();
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;

  if (isUndefShuffle(Op.Val))
    return DAG.getNode(ISD::UNDEF, VT);

  if (isZeroShuffle(Op.Val))
    return getZeroVector(VT, DAG);

  if (isIdentityMask(PermMask.Val))
    return V1;
  else if (isIdentityMask(PermMask.Val, true))
    return V2;

  if (isSplatMask(PermMask.Val)) {
    if (NumElems <= 4) return Op;
    // Promote it to a v4i32 splat.
    return PromoteSplat(Op, DAG);
  }

  if (X86::isMOVLMask(PermMask.Val))
    return (V1IsUndef) ? V2 : Op;

  if (X86::isMOVSHDUPMask(PermMask.Val) ||
      X86::isMOVSLDUPMask(PermMask.Val) ||
      X86::isMOVHLPSMask(PermMask.Val) ||
      X86::isMOVHPMask(PermMask.Val) ||
      X86::isMOVLPMask(PermMask.Val))
    return Op;

  if (ShouldXformToMOVHLPS(PermMask.Val) ||
      ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  // If V1 is a splat or undef and V2 is neither, commute so that the
  // splat/undef operand ends up as V2; the local flags are swapped to match.
  bool Commuted = false;
  V1IsSplat = isSplatVector(V1.Val);
  V2IsSplat = isSplatVector(V2.Val);
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  if (isCommutedMOVL(PermMask.Val, V2IsSplat, V2IsUndef)) {
    if (V2IsUndef) return V1;
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (V2IsSplat) {
      // V2 is a splat, so the mask may be malformed. That is, it may point
      // to any V2 element. The instruction selector won't like this. Get
      // a corrected mask and commute to form a proper MOVS{S|D}.
      SDOperand NewMask = getMOVLMask(NumElems, DAG);
      if (NewMask.Val != PermMask.Val)
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
    }
    return Op;
  }

  if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
      X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
      X86::isUNPCKLMask(PermMask.Val) ||
      X86::isUNPCKHMask(PermMask.Val))
    return Op;

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 points to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SDOperand NewMask = NormalizeMask(PermMask, DAG);
    if (NewMask.Val != PermMask.Val) {
      if (X86::isUNPCKLMask(PermMask.Val, true)) {
        SDOperand NewMask = getUnpacklMask(NumElems, DAG);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
      } else if (X86::isUNPCKHMask(PermMask.Val, true)) {
        SDOperand NewMask = getUnpackhMask(NumElems, DAG);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
      }
    }
  }

  // Normalize the node to match x86 shuffle ops if needed
  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.Val))
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  if (Commuted) {
    // Commute is back and try unpck* again.
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
        X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
        X86::isUNPCKLMask(PermMask.Val) ||
        X86::isUNPCKHMask(PermMask.Val))
      return Op;
  }

  // If VT is integer, try PSHUF* first, then SHUFP*.
  if (MVT::isInteger(VT)) {
    // MMX doesn't have PSHUFD; it does have PSHUFW. While it's theoretically
    // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
    if (((MVT::getSizeInBits(VT) != 64 || NumElems == 4) &&
         X86::isPSHUFDMask(PermMask.Val)) ||
        X86::isPSHUFHWMask(PermMask.Val) ||
        X86::isPSHUFLWMask(PermMask.Val)) {
      // These forms only read V1, so replace a non-undef V2 with UNDEF.
      if (V2.getOpcode() != ISD::UNDEF)
        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
                           DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
      return Op;
    }

    if (X86::isSHUFPMask(PermMask.Val) &&
        MVT::getSizeInBits(VT) != 64)    // Don't do this for MMX.
      return Op;

    // Handle v8i16 shuffle high / low shuffle node pair.
    if (VT == MVT::v8i16 && isPSHUFHW_PSHUFLWMask(PermMask.Val)) {
      MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
      MVT::ValueType BaseVT = MVT::getVectorElementType(MaskVT);
      SmallVector<SDOperand, 8> MaskVec;
      // First shuffle: permute the low four elements, keep the high four.
      for (unsigned i = 0; i != 4; ++i)
        MaskVec.push_back(PermMask.getOperand(i));
      for (unsigned i = 4; i != 8; ++i)
        MaskVec.push_back(DAG.getConstant(i, BaseVT));
      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                   &MaskVec[0], MaskVec.size());
      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
      // Second shuffle: keep the low four, permute the high four.
      MaskVec.clear();
      for (unsigned i = 0; i != 4; ++i)
        MaskVec.push_back(DAG.getConstant(i, BaseVT));
      for (unsigned i = 4; i != 8; ++i)
        MaskVec.push_back(PermMask.getOperand(i));
      Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0],MaskVec.size());
      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, Mask);
    }
  } else {
    // Floating point cases in the other order.
    if (X86::isSHUFPMask(PermMask.Val))
      return Op;
    if (X86::isPSHUFDMask(PermMask.Val) ||
        X86::isPSHUFHWMask(PermMask.Val) ||
        X86::isPSHUFLWMask(PermMask.Val)) {
      if (V2.getOpcode() != ISD::UNDEF)
        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
                           DAG.getNode(ISD::UNDEF, V1.getValueType()),PermMask);
      return Op;
    }
  }

  if (NumElems == 4 &&
      // Don't do this for MMX.
      MVT::getSizeInBits(VT) != 64) {
    MVT::ValueType MaskVT = PermMask.getValueType();
    MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
    // Locs[i] records, for result element i, which intermediate shuffle
    // (first) and which slot inside it (second) holds the value; (-1, -1)
    // marks an undef element. It is indexed directly below, so it must be
    // sized (resize), not merely reserved.
    SmallVector<std::pair<int, int>, 8> Locs;
    Locs.resize(NumElems);
    SmallVector<SDOperand, 8> Mask1(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
    SmallVector<SDOperand, 8> Mask2(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
    unsigned NumHi = 0;
    unsigned NumLo = 0;
    // If no more than two elements come from either vector. This can be
    // implemented with two shuffles. First shuffle gather the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, put the elements into the right order.
    for (unsigned i = 0; i != NumElems; ++i) {
      SDOperand Elt = PermMask.getOperand(i);
      if (Elt.getOpcode() == ISD::UNDEF) {
        Locs[i] = std::make_pair(-1, -1);
      } else {
        unsigned Val = cast<ConstantSDNode>(Elt)->getValue();
        if (Val < NumElems) {
          Locs[i] = std::make_pair(0, NumLo);
          Mask1[NumLo] = Elt;
          NumLo++;
        } else {
          Locs[i] = std::make_pair(1, NumHi);
          // Guard the store: with more than two V2 elements the slot would
          // fall outside Mask1 (that case is rejected just below anyway).
          if (2+NumHi < NumElems)
            Mask1[2+NumHi] = Elt;
          NumHi++;
        }
      }
    }
    if (NumLo <= 2 && NumHi <= 2) {
      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                   &Mask1[0], Mask1.size()));
      for (unsigned i = 0; i != NumElems; ++i) {
        if (Locs[i].first == -1)
          continue;
        else {
          unsigned Idx = (i < NumElems/2) ? 0 : NumElems;
          Idx += Locs[i].first * (NumElems/2) + Locs[i].second;
          Mask2[i] = DAG.getConstant(Idx, MaskEVT);
        }
      }

      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1,
                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                     &Mask2[0], Mask2.size()));
    }

    // Break it into (shuffle shuffle_hi, shuffle_lo).
    // Reset Locs for the second strategy; it is indexed again below, so it
    // has to be re-sized after clearing.
    Locs.clear();
    Locs.resize(NumElems);
    SmallVector<SDOperand,8> LoMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
    SmallVector<SDOperand,8> HiMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
    SmallVector<SDOperand,8> *MaskPtr = &LoMask;
    unsigned MaskIdx = 0;
    unsigned LoIdx = 0;
    unsigned HiIdx = NumElems/2;
    for (unsigned i = 0; i != NumElems; ++i) {
      // Result elements in the upper half are gathered by the second
      // (HiMask) shuffle; restart the slot counters when crossing over.
      if (i == NumElems/2) {
        MaskPtr = &HiMask;
        MaskIdx = 1;
        LoIdx = 0;
        HiIdx = NumElems/2;
      }
      SDOperand Elt = PermMask.getOperand(i);
      if (Elt.getOpcode() == ISD::UNDEF) {
        Locs[i] = std::make_pair(-1, -1);
      } else if (cast<ConstantSDNode>(Elt)->getValue() < NumElems) {
        Locs[i] = std::make_pair(MaskIdx, LoIdx);
        (*MaskPtr)[LoIdx] = Elt;
        LoIdx++;
      } else {
        Locs[i] = std::make_pair(MaskIdx, HiIdx);
        (*MaskPtr)[HiIdx] = Elt;
        HiIdx++;
      }
    }

    SDOperand LoShuffle =
      DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
                  DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                              &LoMask[0], LoMask.size()));
    SDOperand HiShuffle =
      DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
                  DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                              &HiMask[0], HiMask.size()));
    // Final shuffle picks each result element out of LoShuffle/HiShuffle
    // using the recorded locations.
    SmallVector<SDOperand, 8> MaskOps;
    for (unsigned i = 0; i != NumElems; ++i) {
      if (Locs[i].first == -1) {
        MaskOps.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
      } else {
        unsigned Idx = Locs[i].first * NumElems + Locs[i].second;
        MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
      }
    }
    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, LoShuffle, HiShuffle,
                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                   &MaskOps[0], MaskOps.size()));
  }

  return SDOperand();
}

/// LowerEXTRACT_VECTOR_ELT - Custom lower EXTRACT_VECTOR_ELT with a constant
/// index. 16-bit results go through PEXTRW; 32- and 64-bit results at a
/// non-zero index are first shuffled into element 0 and then extracted from
/// there. Returns an empty SDOperand for a non-constant index or any other
/// result width, and Op itself for a 32/64-bit extract of element 0.
SDOperand
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
  SDOperand IdxOp = Op.getOperand(1);
  if (!isa<ConstantSDNode>(IdxOp))
    return SDOperand();

  MVT::ValueType VT = Op.getValueType();
  const unsigned ResultBits = MVT::getSizeInBits(VT);

  // TODO: handle v16i8.
  if (ResultBits == 16) {
    // Transform it so it match pextrw which produces a 32-bit result. The
    // wider type is taken as the value type that follows VT in the enum.
    MVT::ValueType WideVT = static_cast<MVT::ValueType>(VT + 1);
    SDOperand Wide = DAG.getNode(X86ISD::PEXTRW, WideVT,
                                 Op.getOperand(0), IdxOp);
    SDOperand Zext = DAG.getNode(ISD::AssertZext, WideVT, Wide,
                                 DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, VT, Zext);
  }

  if (ResultBits != 32 && ResultBits != 64)
    return SDOperand();

  // Element 0 needs no shuffling.
  unsigned Idx = cast<ConstantSDNode>(IdxOp)->getValue();
  if (Idx == 0)
    return Op;

  // Build a mask whose first entry selects the wanted element and whose
  // remaining entries are undef.
  //   32-bit: SHUFPS the element to the lowest double word, then movss.
  //   64-bit: UNPCKHPD the element to the lowest double word, then movsd.
  //           Note if the lower 64 bits of the result of the UNPCKHPD is then
  //           stored to a f64mem, the whole operation is folded into a single
  //           MOVHPDmr.
  // NOTE(review): the 64-bit path uses a 4-element mask type but only two
  // mask entries; preserved as-is -- confirm whether this is intended.
  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
  MVT::ValueType MaskEltVT = MVT::getVectorElementType(MaskVT);
  SmallVector<SDOperand, 8> MaskElts;
  unsigned NumUndefEntries;
  if (ResultBits == 32) {
    MaskElts.push_back(DAG.getConstant(Idx, MaskEltVT));
    NumUndefEntries = 3;
  } else {
    MaskElts.push_back(DAG.getConstant(1, MaskEltVT));
    NumUndefEntries = 1;
  }
  for (unsigned i = 0; i != NumUndefEntries; ++i)
    MaskElts.push_back(DAG.getNode(ISD::UNDEF, MaskEltVT));
  SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                               &MaskElts[0], MaskElts.size());

  SDOperand Src = Op.getOperand(0);
  MVT::ValueType SrcVT = Src.getValueType();
  SDOperand Shuffled = DAG.getNode(ISD::VECTOR_SHUFFLE, SrcVT, Src,
                                   DAG.getNode(ISD::UNDEF, SrcVT), Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Shuffled,
                     DAG.getConstant(0, getPointerTy()));
}

/// LowerINSERT_VECTOR_ELT - Custom lower INSERT_VECTOR_ELT (operands: vector,
/// scalar, index) for 16-bit and 32-bit element types when the index is a
/// constant. 16-bit elements become a single PINSRW; a 32-bit element at
/// index 0 becomes a MOVL-style shuffle, and at any other index two PINSRW
/// nodes on the vector viewed as v8i16. Returns an empty SDOperand for a
/// non-constant index or any other element width.
SDOperand
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType BaseVT = MVT::getVectorElementType(VT);
  SDOperand N0 = Op.getOperand(0);
  SDOperand N1 = Op.getOperand(1);
  SDOperand N2 = Op.getOperand(2);

  // Every path below needs the index as an immediate. Bail out on a variable
  // index instead of hitting cast<> on a non-constant node; this mirrors the
  // guard at the top of LowerEXTRACT_VECTOR_ELT.
  if (!isa<ConstantSDNode>(N2))
    return SDOperand();

  if (MVT::getSizeInBits(BaseVT) == 16) {
    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
    // as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getConstant(cast<ConstantSDNode>(N2)->getValue(),getPointerTy());
    return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2);
  } else if (MVT::getSizeInBits(BaseVT) == 32) {
    unsigned Idx = cast<ConstantSDNode>(N2)->getValue();
    if (Idx == 0) {
      // Use a movss.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, N1);
      MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
      MVT::ValueType MaskEltVT = MVT::getVectorElementType(MaskVT);
      // Mask <4, 1, 2, 3>: element 0 comes from N1, the rest from N0.
      SmallVector<SDOperand, 8> MaskVec;
      MaskVec.push_back(DAG.getConstant(4, MaskEltVT));
      for (unsigned i = 1; i <= 3; ++i)
        MaskVec.push_back(DAG.getConstant(i, MaskEltVT));
      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, N0, N1,
                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                     &MaskVec[0], MaskVec.size()));
    } else {
      // Use two pinsrw instructions to insert a 32 bit value.
      // Each 32-bit element spans two 16-bit lanes, so double the index.
      Idx <<= 1;
      if (MVT::isFloatingPoint(N1.getValueType())) {
        // Reinterpret the floating point scalar as an i32 by going through a
        // vector bit-convert.
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v4f32, N1);
        N1 = DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, N1);
        N1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32, N1,
                         DAG.getConstant(0, getPointerTy()));
      }
      N0 = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, N0);
      // Low half into lane Idx, then the value shifted right by 16 into
      // lane Idx+1.
      N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1,
                       DAG.getConstant(Idx, getPointerTy()));
      N1 = DAG.getNode(ISD::SRL, MVT::i32, N1, DAG.getConstant(16, MVT::i8));
      N0 = DAG.getNode(X86ISD::PINSRW, MVT::v8i16, N0, N1,
                       DAG.getConstant(Idx+1, getPointerTy()));
      return DAG.getNode(ISD::BIT_CONVERT, VT, N0);
    }
  }

  return SDOperand();
}

/// LowerSCALAR_TO_VECTOR - Any-extend the scalar operand to i32 and wrap it
/// in an X86ISD::S2VEC node of the original result type.
SDOperand
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  SDOperand Scalar = Op.getOperand(0);
  SDOperand Widened = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Scalar);
  return DAG.getNode(X86ISD::S2VEC, Op.getValueType(), Widened);
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.

/// LowerConstantPool - Lower a ConstantPool node to a wrapped
/// TargetConstantPool, adding the global base register when generating PIC
/// code with a non-RIP-relative PIC style.
SDOperand
X86TargetLowering::LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDOperand TargetCP = DAG.getTargetConstantPool(CP->getConstVal(),
                                                 getPointerTy(),
                                                 CP->getAlignment());
  SDOperand Wrapped = DAG.getNode(X86ISD::Wrapper, getPointerTy(), TargetCP);

  const bool NeedsPICBase =
    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
    !Subtarget->isPICStyleRIPRel();
  if (!NeedsPICBase)
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  return DAG.getNode(ISD::ADD, getPointerTy(),
                     DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
                     Wrapped);
}

/// LowerGlobalAddress - Lower a GlobalAddress node to a wrapped
/// TargetGlobalAddress. Adds the global base register for non-RIP-relative
/// PIC, and loads through the address when the subtarget reports that the
/// global requires an extra load.
SDOperand
X86TargetLowering::LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) {
  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  SDOperand Addr = DAG.getNode(X86ISD::Wrapper, getPointerTy(),
                               DAG.getTargetGlobalAddress(GV, getPointerTy()));

  // With PIC, the address is actually $g + Offset.
  const bool NeedsPICBase =
    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
    !Subtarget->isPICStyleRIPRel();
  if (NeedsPICBase) {
    SDOperand PICBase = DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy());
    Addr = DAG.getNode(ISD::ADD, getPointerTy(), PICBase, Addr);
  }

  // For Darwin & Mingw32, external and weak symbols are indirect, so we want to
  // load the value at address GV, not the value of GV itself. This means that
  // the GlobalAddress must be in the base or index register of the address, not
  // the GV offset field. Platform check is inside GVRequiresExtraLoad() call
  // The same applies for external symbols during PIC codegen
  if (!Subtarget->GVRequiresExtraLoad(GV, getTargetMachine(), false))
    return Addr;

  return DAG.getLoad(getPointerTy(), DAG.getEntryNode(), Addr, NULL, 0);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model
//
// Builds, in order and linked through the chain and flag values:
//   1. a copy of the global base register into EBX,
//   2. an X86ISD::TLSADDR node for the target global address,
//   3. a copy of that result into EAX,
//   4. an X86ISD::CALL to ___tls_get_addr,
// and returns a copy-from-register of EAX after the call.
static SDOperand
LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                              const MVT::ValueType PtrVT) {
  // InFlag starts out empty and is threaded through every node below.
  SDOperand InFlag;
  SDOperand Chain = DAG.getCopyToReg(DAG.getEntryNode(), X86::EBX,
                                     DAG.getNode(X86ISD::GlobalBaseReg,
                                                 PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  // emit leal symbol@TLSGD(,%ebx,1), %eax
  // TLSADDR produces three values: the address, a chain and a flag.
  SDVTList NodeTys = DAG.getVTList(PtrVT, MVT::Other, MVT::Flag);
  SDOperand TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                             GA->getValueType(0),
                                             GA->getOffset());
  SDOperand Ops[] = { Chain,  TGA, InFlag };
  SDOperand Result = DAG.getNode(X86ISD::TLSADDR, NodeTys, Ops, 3);
  InFlag = Result.getValue(2);
  Chain = Result.getValue(1);

  // call ___tls_get_addr. This function receives its argument in
  // the register EAX.
  Chain = DAG.getCopyToReg(Chain, X86::EAX, Result, InFlag);
  InFlag = Chain.getValue(1);

  // The call node produces a chain and a flag. Its operands are the chain,
  // the callee symbol, the EAX and EBX registers, and the incoming flag.
  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SDOperand Ops1[] = { Chain,
                      DAG.getTargetExternalSymbol("___tls_get_addr",
                                                  PtrVT),
                      DAG.getRegister(X86::EAX, PtrVT),
                      DAG.getRegister(X86::EBX, PtrVT),
                      InFlag };
  Chain = DAG.getNode(X86ISD::CALL, NodeTys, Ops1, 5);
  InFlag = Chain.getValue(1);

  // The lowered value is whatever is in EAX after the call.
  return DAG.getCopyFromReg(Chain, X86::EAX, PtrVT, InFlag);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model. The result is the thread pointer plus the variable's
// offset; for a global that is only a declaration the offset is loaded from
// memory (initial exec) instead of being used directly (local exec).
static SDOperand
LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                         const MVT::ValueType PtrVT) {
  // Get the Thread Pointer
  SDOperand TP = DAG.getNode(X86ISD::THREAD_POINTER, PtrVT);

  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
  // exec)
  SDOperand TargetAddr = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                                    GA->getValueType(0),
                                                    GA->getOffset());
  SDOperand VarOffset = DAG.getNode(X86ISD::Wrapper, PtrVT, TargetAddr);

  const bool IsInitialExec = GA->getGlobal()->isDeclaration();
  if (IsInitialExec)
    VarOffset = DAG.getLoad(PtrVT, DAG.getEntryNode(), VarOffset, NULL, 0);

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, PtrVT, TP, VarOffset);
}

/// LowerGlobalTLSAddress - Custom lower ISD::GlobalTLSAddress by dispatching
/// to one of the TLS model helpers based on the relocation model. Only
/// 32-bit ELF targets are supported (asserted below).
SDOperand
X86TargetLowering::LowerGlobalTLSAddress(SDOperand Op, SelectionDAG &DAG) {
  // TODO: implement the "local dynamic" model
  // TODO: implement the "initial exec" model for pic executables
  assert(!Subtarget->is64Bit() && Subtarget->isTargetELF() &&
         "TLS not implemented for non-ELF and 64-bit targets");
  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  // PIC code goes through the "General Dynamic" TLS model; every other
  // relocation model uses the exec-model lowering.
  const bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
  return IsPIC ? LowerToTLSGeneralDynamicModel(GA, DAG, getPointerTy())
               : LowerToTLSExecModel(GA, DAG, getPointerTy());
}

/// LowerExternalSymbol - Custom lower an external symbol reference into a
/// wrapped target external symbol, adding the global base register when
/// compiling PIC code that is not RIP-relative.
SDOperand
X86TargetLowering::LowerExternalSymbol(SDOperand Op, SelectionDAG &DAG) {
  ExternalSymbolSDNode *ES = cast<ExternalSymbolSDNode>(Op);
  SDOperand TargetSym =
    DAG.getTargetExternalSymbol(ES->getSymbol(), getPointerTy());
  SDOperand Wrapped = DAG.getNode(X86ISD::Wrapper, getPointerTy(), TargetSym);

  // Without PIC, or with RIP-relative PIC, the wrapped symbol is the address.
  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
      Subtarget->isPICStyleRIPRel())
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  SDOperand PICBase = DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy());
  return DAG.getNode(ISD::ADD, getPointerTy(), PICBase, Wrapped);
}

/// LowerJumpTable - Custom lower a jump table reference into a wrapped target
/// jump table node, adding the global base register when compiling PIC code
/// that is not RIP-relative.
SDOperand X86TargetLowering::LowerJumpTable(SDOperand Op, SelectionDAG &DAG) {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
  SDOperand TargetJT = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
  SDOperand Wrapped = DAG.getNode(X86ISD::Wrapper, getPointerTy(), TargetJT);

  // Without PIC, or with RIP-relative PIC, the wrapped node is the address.
  if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
      Subtarget->isPICStyleRIPRel())
    return Wrapped;

  // With PIC, the address is actually $g + Offset.
  SDOperand PICBase = DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy());
  return DAG.getNode(ISD::ADD, getPointerTy(), PICBase, Wrapped);
}

/// LowerShift - Custom lower a "parts" shift: a shift of a value that is
/// supplied as two i32 halves. Operand 0 is the low half, operand 1 the high
/// half and operand 2 the shift amount. ISD::SHL_PARTS is lowered as a left
/// shift; any other opcode is lowered as a right shift (arithmetic for
/// ISD::SRA_PARTS, logical otherwise).
///
/// Returns an ISD::MERGE_VALUES node whose operands are the new low half and
/// the new high half, in that order.
SDOperand X86TargetLowering::LowerShift(SDOperand Op, SelectionDAG &DAG) {
  assert(Op.getNumOperands() == 3 && Op.getValueType() == MVT::i32 &&
         "Not an i64 shift!");
  const bool isSHL = Op.getOpcode() == ISD::SHL_PARTS;
  const bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
  SDOperand ShOpLo = Op.getOperand(0);
  SDOperand ShOpHi = Op.getOperand(1);
  SDOperand ShAmt  = Op.getOperand(2);

  // Tmp1 is the fill value: the high half shifted right arithmetically by 31
  // for SRA_PARTS, and the constant 0 otherwise.
  SDOperand Tmp1 = isSRA ?
    DAG.getNode(ISD::SRA, MVT::i32, ShOpHi, DAG.getConstant(31, MVT::i8)) :
    DAG.getConstant(0, MVT::i32);

  // Tmp2 is the double-width shift (SHLD/SHRD) across both halves; Tmp3 is
  // the plain 32-bit shift of the single half that feeds the shift.
  SDOperand Tmp2, Tmp3;
  if (isSHL) {
    Tmp2 = DAG.getNode(X86ISD::SHLD, MVT::i32, ShOpHi, ShOpLo, ShAmt);
    Tmp3 = DAG.getNode(ISD::SHL, MVT::i32, ShOpLo, ShAmt);
  } else {
    Tmp2 = DAG.getNode(X86ISD::SHRD, MVT::i32, ShOpLo, ShOpHi, ShAmt);
    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, MVT::i32, ShOpHi, ShAmt);
  }

  // Compare (ShAmt & 32) against 0; the CMOVs below are keyed on COND_NE of
  // this comparison, i.e. on whether bit 5 of the shift amount is set.
  SDOperand AndNode = DAG.getNode(ISD::AND, MVT::i8, ShAmt,
                                  DAG.getConstant(32, MVT::i8));
  SDOperand Cond = DAG.getNode(X86ISD::CMP, MVT::i32,
                               AndNode, DAG.getConstant(0, MVT::i8));
  SDOperand CC = DAG.getConstant(X86::COND_NE, MVT::i8);

  // Both directions need the same two conditional moves, created in the same
  // order; only their assignment to the low/high result differs.
  SDOperand ShiftedOps[] = { Tmp2, Tmp3, CC, Cond };
  SDOperand Shifted = DAG.getNode(X86ISD::CMOV, MVT::i32, ShiftedOps, 4);
  SDOperand FilledOps[] = { Tmp3, Tmp1, CC, Cond };
  SDOperand Filled = DAG.getNode(X86ISD::CMOV, MVT::i32, FilledOps, 4);

  SDOperand Lo = isSHL ? Filled : Shifted;
  SDOperand Hi = isSHL ? Shifted : Filled;

  const MVT::ValueType *VTs = DAG.getNodeValueTypes(MVT::i32, MVT::i32);
  SDOperand Parts[] = { Lo, Hi };
  return DAG.getNode(ISD::MERGE_VALUES, VTs, 2, Parts, 2);
}

/// LowerSINT_TO_FP - Custom lower a signed integer to floating point
/// conversion. The integer operand is stored to a fresh stack slot and
/// converted with an X86ISD::FILD node. When the result type is handled by
/// scalar SSE (per X86ScalarSSEf32 / X86ScalarSSEf64), X86ISD::FILD_FLAG is
/// used instead and the value is round-tripped through a second stack slot
/// with X86ISD::FST followed by a load.
///
/// Returns a null SDOperand for the combinations that are really Legal.
SDOperand X86TargetLowering::LowerSINT_TO_FP(SDOperand Op, SelectionDAG &DAG) {
  assert(Op.getOperand(0).getValueType() <= MVT::i64 &&
         Op.getOperand(0).getValueType() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  MVT::ValueType SrcVT = Op.getOperand(0).getValueType();
  MVT::ValueType DstVT = Op.getValueType();

  // These are really Legal; caller falls through into that case.
  // They are checked before anything is created so that the Legal paths do
  // not leave an unused stack object and store node behind.
  if (SrcVT == MVT::i32 && DstVT == MVT::f32 && X86ScalarSSEf32)
    return SDOperand();
  if (SrcVT == MVT::i32 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return SDOperand();
  if (SrcVT == MVT::i64 && DstVT != MVT::f80 && Subtarget->is64Bit())
    return SDOperand();

  // Spill the integer operand to a stack slot of its own size.
  unsigned Size = MVT::getSizeInBits(SrcVT)/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size);
  SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDOperand Chain = DAG.getStore(DAG.getEntryNode(), Op.getOperand(0),
                                 StackSlot, NULL, 0);

  // Build the FILD
  SDVTList Tys;
  bool useSSE = (X86ScalarSSEf32 && DstVT == MVT::f32) ||
                (X86ScalarSSEf64 && DstVT == MVT::f64);
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
  else
    Tys = DAG.getVTList(DstVT, MVT::Other);
  SmallVector<SDOperand, 8> FILDOps;
  FILDOps.push_back(Chain);
  FILDOps.push_back(StackSlot);
  FILDOps.push_back(DAG.getValueType(SrcVT));
  SDOperand Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD,
                                 Tys, &FILDOps[0], FILDOps.size());

  if (useSSE) {
    Chain = Result.getValue(1);
    SDOperand InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    int ResultFI = MF.getFrameInfo()->CreateStackObject(8, 8);
    SDOperand ResultSlot = DAG.getFrameIndex(ResultFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SmallVector<SDOperand, 8> FSTOps;
    FSTOps.push_back(Chain);
    FSTOps.push_back(Result);
    FSTOps.push_back(ResultSlot);
    FSTOps.push_back(DAG.getValueType(DstVT));
    FSTOps.push_back(InFlag);
    Chain = DAG.getNode(X86ISD::FST, Tys, &FSTOps[0], FSTOps.size());
    // Reload the stored value in the requested result type.
    Result = DAG.getLoad(DstVT, Chain, ResultSlot, NULL, 0);
  }

  return Result;
}

/// LowerFP_TO_SINT - Custom lower a floating point to signed integer
/// conversion. The value is converted with one of the
/// X86ISD::FP_TO_INT*_IN_MEM nodes, which writes to a temporary stack slot,
/// and the integer result is then loaded back from that slot.
///
/// Returns a null SDOperand for the combinations that are really Legal.
SDOperand X86TargetLowering::LowerFP_TO_SINT(SDOperand Op, SelectionDAG &DAG) {
  assert(Op.getValueType() <= MVT::i64 && Op.getValueType() >= MVT::i16 &&
         "Unknown FP_TO_SINT to lower!");

  // These are really Legal.
  // They are checked before the stack slot is created so that the Legal
  // paths do not leave an unused stack object behind.
  if (Op.getValueType() == MVT::i32 &&
      X86ScalarSSEf32 && Op.getOperand(0).getValueType() == MVT::f32)
    return SDOperand();
  if (Op.getValueType() == MVT::i32 &&
      X86ScalarSSEf64 && Op.getOperand(0).getValueType() == MVT::f64)
    return SDOperand();
  if (Subtarget->is64Bit() &&
      Op.getValueType() == MVT::i64 &&
      Op.getOperand(0).getValueType() != MVT::f80)
    return SDOperand();

  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = MVT::getSizeInBits(Op.getValueType())/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
  SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  unsigned Opc;
  switch (Op.getValueType()) {
    default: assert(0 && "Invalid FP_TO_SINT to lower!");
    case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
    case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
    case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDOperand Chain = DAG.getEntryNode();
  SDOperand Value = Op.getOperand(0);
  if ((X86ScalarSSEf32 && Op.getOperand(0).getValueType() == MVT::f32) ||
      (X86ScalarSSEf64 && Op.getOperand(0).getValueType() == MVT::f64)) {
    assert(Op.getValueType() == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    // The operand is first stored to the stack slot and reloaded with an
    // X86ISD::FLD node; a second stack slot is then created to receive the
    // integer result.
    Chain = DAG.getStore(Chain, Value, StackSlot, NULL, 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDOperand Ops[] = {
      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
    };
    Value = DAG.getNode(X86ISD::FLD, Tys, Ops, 3);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  // Build the FP_TO_INT*_IN_MEM
  SDOperand Ops[] = { Chain, Value, StackSlot };
  SDOperand FIST = DAG.getNode(Opc, MVT::Other, Ops, 3);

  // Load the result.
  return DAG.getLoad(Op.getValueType(), FIST, StackSlot, NULL, 0);
}

/// LowerFABS - Custom lower FABS as an X86ISD::FAND of the operand with a
/// constant-pool mask in which every bit except the sign bit of each element
/// is set.
SDOperand X86TargetLowering::LowerFABS(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  // For vector types the mask is built per element; scalars are their own
  // element type.
  MVT::ValueType EltVT =
    MVT::isVector(VT) ? MVT::getVectorElementType(VT) : VT;
  const Type *EltTy = MVT::getTypeForValueType(EltVT);

  // Two 64-bit mask elements for f64; otherwise four 32-bit mask elements.
  std::vector<Constant*> MaskElts;
  if (EltVT == MVT::f64) {
    Constant *Elt = ConstantFP::get(EltTy, APFloat(APInt(64, ~(1ULL << 63))));
    MaskElts.assign(2, Elt);
  } else {
    Constant *Elt = ConstantFP::get(EltTy, APFloat(APInt(32, ~(1U << 31))));
    MaskElts.assign(4, Elt);
  }

  // Place the mask vector in the constant pool and load it back as VT.
  Constant *MaskVec = ConstantVector::get(MaskElts);
  SDOperand CPIdx = DAG.getConstantPool(MaskVec, getPointerTy(), 4);
  SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0,
                               false, 16);
  return DAG.getNode(X86ISD::FAND, VT, Op.getOperand(0), Mask);
}

SDOperand X86TargetLowering::LowerFNEG(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType EltVT = VT;
  unsigned EltNum = 1;
  if (MVT::isVector(VT)) {
    EltVT = MVT::getVectorElementType(VT);
    EltNum = MVT::getVectorNumElements(VT);
  }
  const Type *OpNTy =  MVT::getTypeForValueType(EltVT);
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(OpNTy, APFloat(APInt(64, 1ULL << 63)));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(OpNTy, APFloat(APInt(32, 1U << 31)));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDOperand CPIdx = DAG.getConstantPool(C, getPointerTy(), 4);
  SDOperand Mask = DAG.getLoad(VT, DAG.getEntryNode(), CPIdx, NULL, 0,
                               false, 16);