X86ISelLowering.cpp

    for (unsigned i = 0; i != 16; ++i) {
      int EltIdx = MaskVals[i];
      if (EltIdx < 16) {
        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
        continue;
      }
      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
    }
    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, dl,
                                 MVT::v16i8, &pshufbMask[0], 16));
    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
  }

  // No SSSE3 - Calculate in place words and then fix all out of place words
  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
  // the 16 different words that comprise the two doublequadword input vectors.
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
  SDValue NewV = V2Only ? V2 : V1;
  for (int i = 0; i != 8; ++i) {
    int Elt0 = MaskVals[i*2];
    int Elt1 = MaskVals[i*2+1];

    // This word of the result is all undef, skip it.
    if (Elt0 < 0 && Elt1 < 0)
      continue;

    // This word of the result is already in the correct place, skip it.
    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
      continue;
    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
      continue;

    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
    SDValue InsElt;

    // If Elt0 and Elt1 are defined, are consecutive, and can be load
    // using a single extract together, load it and store it.
    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                        DAG.getIntPtrConstant(i));
      continue;
    }

    // If Elt1 is defined, extract it from the appropriate source.  If the
    // source byte is not also odd, shift the extracted word left 8 bits
    // otherwise clear the bottom 8 bits if we need to do an or.
    if (Elt1 >= 0) {
      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
                           DAG.getIntPtrConstant(Elt1 / 2));
      if ((Elt1 & 1) == 0)
        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
                             DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt0 >= 0)
        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
                             DAG.getConstant(0xFF00, MVT::i16));
    }
    // If Elt0 is defined, extract it from the appropriate source.  If the
    // source byte is not also even, shift the extracted word right 8 bits. If
    // Elt1 was also defined, OR the extracted values together before
    // inserting them in the result.
    if (Elt0 >= 0) {
      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
      if ((Elt0 & 1) != 0)
        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
                              DAG.getConstant(8, TLI.getShiftAmountTy()));
      else if (Elt1 >= 0)
        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
                             DAG.getConstant(0x00FF, MVT::i16));
      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
                         : InsElt0;
    }
    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
                       DAG.getIntPtrConstant(i));
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG,
                                 TargetLowering &TLI, DebugLoc dl) {
  EVT VT = SVOp->getValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  unsigned NumElems = VT.getVectorNumElements();
  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
  EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
  EVT MaskEltVT = MaskVT.getVectorElementType();
  EVT NewVT = MaskVT;
  switch (VT.getSimpleVT().SimpleTy) {
  default: assert(false && "Unexpected!");
  case MVT::v4f32: NewVT = MVT::v2f64; break;
  case MVT::v4i32: NewVT = MVT::v2i64; break;
  case MVT::v8i16: NewVT = MVT::v4i32; break;
  case MVT::v16i8: NewVT = MVT::v4i32; break;
  }

  if (NewWidth == 2) {
    if (VT.isInteger())
      NewVT = MVT::v2i64;
    else
      NewVT = MVT::v2f64;
  }
  int Scale = NumElems / NewWidth;
  SmallVector<int, 8> MaskVec;
  for (unsigned i = 0; i < NumElems; i += Scale) {
    int StartIdx = -1;
    for (int j = 0; j < Scale; ++j) {
      int EltIdx = SVOp->getMaskElt(i+j);
      if (EltIdx < 0)
        continue;
      if (StartIdx == -1)
        StartIdx = EltIdx - (EltIdx % Scale);
      if (EltIdx != StartIdx + j)
        return SDValue();
    }
    if (StartIdx == -1)
      MaskVec.push_back(-1);
    else
      MaskVec.push_back(StartIdx / Scale);
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}

/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(EVT VT, EVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, DebugLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = NULL;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
        // PR2108
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                          .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BIT_CONVERT, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
/// shuffles.
static SDValue
LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  DebugLoc dl = SVOp->getDebugLoc();
  EVT VT = SVOp->getValueType(0);

  SmallVector<std::pair<int, int>, 8> Locs;
  Locs.resize(4);
  SmallVector<int, 8> Mask1(4U, -1);
  SmallVector<int, 8> PermMask;
  SVOp->getMask(PermMask);

  unsigned NumHi = 0;
  unsigned NumLo = 0;
  for (unsigned i = 0; i != 4; ++i) {
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else {
      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
      if (Idx < 4) {
        Locs[i] = std::make_pair(0, NumLo);
        Mask1[NumLo] = Idx;
        NumLo++;
      } else {
        Locs[i] = std::make_pair(1, NumHi);
        if (2+NumHi < 4)
          Mask1[2+NumHi] = Idx;
        NumHi++;
      }
    }
  }

  if (NumLo <= 2 && NumHi <= 2) {
    // If no more than two elements come from either vector. This can be
    // implemented with two shuffles. First shuffle gather the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, put the elements into the right order.
    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    SmallVector<int, 8> Mask2(4U, -1);

    for (unsigned i = 0; i != 4; ++i) {
      if (Locs[i].first == -1)
        continue;
      else {
        unsigned Idx = (i < 2) ? 0 : 4;
        Idx += Locs[i].first * 2 + Locs[i].second;
        Mask2[i] = Idx;
      }
    }

    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
  } else if (NumLo == 3 || NumHi == 3) {
    // Otherwise, we must have three elements from one vector, call it X, and
    // one element from the other, call it Y.  First, use a shufps to build an
    // intermediate vector with the one element from Y and the element from X
    // that will be in the same half in the final destination (the indexes don't
    // matter). Then, use a shufps to build the final vector, taking the half
    // containing the element from Y from the intermediate, and the other half
    // from X.
    if (NumHi == 3) {
      // Normalize it so the 3 elements come from V1.
      CommuteVectorShuffleMask(PermMask, VT);
      std::swap(V1, V2);
    }

    // Find the element from V2.
    unsigned HiIndex;
    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
      int Val = PermMask[HiIndex];
      if (Val < 0)
        continue;
      if (Val >= 4)
        break;
    }

    Mask1[0] = PermMask[HiIndex];
    Mask1[1] = -1;
    Mask1[2] = PermMask[HiIndex^1];
    Mask1[3] = -1;
    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);

    if (HiIndex >= 2) {
      Mask1[0] = PermMask[0];
      Mask1[1] = PermMask[1];
      Mask1[2] = HiIndex & 1 ? 6 : 4;
      Mask1[3] = HiIndex & 1 ? 4 : 6;
      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
    } else {
      Mask1[0] = HiIndex & 1 ? 2 : 0;
      Mask1[1] = HiIndex & 1 ? 0 : 2;
      Mask1[2] = PermMask[2];
      Mask1[3] = PermMask[3];
      if (Mask1[2] >= 0)
        Mask1[2] += 4;
      if (Mask1[3] >= 0)
        Mask1[3] += 4;
      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
    }
  }

  // Break it into (shuffle shuffle_hi, shuffle_lo).
  Locs.clear();
  SmallVector<int,8> LoMask(4U, -1);
  SmallVector<int,8> HiMask(4U, -1);

  SmallVector<int,8> *MaskPtr = &LoMask;
  unsigned MaskIdx = 0;
  unsigned LoIdx = 0;
  unsigned HiIdx = 2;
  for (unsigned i = 0; i != 4; ++i) {
    if (i == 2) {
      MaskPtr = &HiMask;
      MaskIdx = 1;
      LoIdx = 0;
      HiIdx = 2;
    }
    int Idx = PermMask[i];
    if (Idx < 0) {
      Locs[i] = std::make_pair(-1, -1);
    } else if (Idx < 4) {
      Locs[i] = std::make_pair(MaskIdx, LoIdx);
      (*MaskPtr)[LoIdx] = Idx;
      LoIdx++;
    } else {
      Locs[i] = std::make_pair(MaskIdx, HiIdx);
      (*MaskPtr)[HiIdx] = Idx;
      HiIdx++;
    }
  }

  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
  SmallVector<int, 8> MaskOps;
  for (unsigned i = 0; i != 4; ++i) {
    if (Locs[i].first == -1) {
      MaskOps.push_back(-1);
    } else {
      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
      MaskOps.push_back(Idx);
    }
  }
  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
}

SDValue
X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  unsigned NumElems = VT.getVectorNumElements();
  bool isMMX = VT.getSizeInBits() == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;

  if (isZeroShuffle(SVOp))
    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);

  // Promote splats to v4f32.
  if (SVOp->isSplat()) {
    if (isMMX || NumElems < 4)
      return Op;
    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
    if (NewOp.getNode())
      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
                         LowerVECTOR_SHUFFLE(NewOp, DAG));
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode()) {
        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
                              DAG, Subtarget, dl);
      }
    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
                            DAG, Subtarget, dl);
    }
  }

  if (X86::isPSHUFDMask(SVOp))
    return Op;

  // Check if this can be converted into a logical shift.
  bool isLeft = false;
  unsigned ShAmt = 0;
  SDValue ShVal;
  bool isShift = getSubtarget()->hasSSE2() &&
    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
  if (isShift && ShVal.hasOneUse()) {
    // If the shifted value has multiple uses, it may be cheaper to use
    // v_set0 + movlhps or movhlps, etc.
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  if (X86::isMOVLMask(SVOp)) {
    if (V1IsUndef)
      return V2;
    if (ISD::isBuildVectorAllZeros(V1.getNode()))
      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
    if (!isMMX)
      return Op;
  }

  // FIXME: fold these into legal mask.
  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
                 X86::isMOVSLDUPMask(SVOp) ||
                 X86::isMOVHLPSMask(SVOp) ||
                 X86::isMOVLHPSMask(SVOp) ||
                 X86::isMOVLPMask(SVOp)))
    return Op;

  if (ShouldXformToMOVHLPS(SVOp) ||
      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  if (isShift) {
    // No better options. Use a vshl / vsrl.
    EVT EltVT = VT.getVectorElementType();
    ShAmt *= EltVT.getSizeInBits();
    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
  }

  bool Commuted = false;
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.getNode());
  V2IsSplat = isSplatVector(V2.getNode());

  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(SVOp, DAG);
    SVOp = cast<ShuffleVectorSDNode>(Op);
    V1 = SVOp->getOperand(0);
    V2 = SVOp->getOperand(1);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
    // Shuffling low element of v1 into undef, just return v1.
    if (V2IsUndef)
      return V1;
    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
    // the instruction selector will not match, so get a canonical MOVL with
    // swapped operands to undo the commute.
    return getMOVL(DAG, dl, VT, V2, V1);
  }

  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
      X86::isUNPCKH_v_undef_Mask(SVOp) ||
      X86::isUNPCKLMask(SVOp) ||
      X86::isUNPCKHMask(SVOp))
    return Op;

  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 points to its first
    // element then try to match unpck{h|l} again. If match, return a
    // new vector_shuffle with the corrected mask.
    SDValue NewMask = NormalizeMask(SVOp, DAG);
    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
    if (NSVOp != SVOp) {
      if (X86::isUNPCKLMask(NSVOp, true)) {
        return NewMask;
      } else if (X86::isUNPCKHMask(NSVOp, true)) {
        return NewMask;
      }
    }
  }

  if (Commuted) {
    // Commute is back and try unpck* again.
    // FIXME: this seems wrong.
    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
        X86::isUNPCKLMask(NewSVOp) ||
        X86::isUNPCKHMask(NewSVOp))
      return NewOp;
  }

  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.

  // Normalize the node to match x86 shuffle ops if needed
  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
    return CommuteVectorShuffle(SVOp, DAG);

  // Check for legal shuffle and return?
  SmallVector<int, 16> PermMask;
  SVOp->getMask(PermMask);
  if (isShuffleMaskLegal(PermMask, VT))
    return Op;

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  if (VT == MVT::v16i8) {
    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
    if (NewOp.getNode())
      return NewOp;
  }

  // Handle all 4 wide cases with a number of shuffles except for MMX.
  if (NumElems == 4 && !isMMX)
    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);

  return SDValue();
}

SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
                                                SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  if (VT.getSizeInBits() == 8) {
    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 16) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32,
                                                 Op.getOperand(0)),
                                     Op.getOperand(1)));
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32.  And in
    // the case of a store, it's not worth it if the index is a constant 0,
    // because a MOVSSmr can be used instead, which is smaller and faster.
    if (!Op.hasOneUse())
      return SDValue();
    SDNode *User = *Op.getNode()->use_begin();
    if ((User->getOpcode() != ISD::STORE ||
         (isa<ConstantSDNode>(Op.getOperand(1)) &&
          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
        (User->getOpcode() != ISD::BIT_CONVERT ||
         User->getValueType(0) != MVT::i32))
      return SDValue();
    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
                                              Op.getOperand(0)),
                                              Op.getOperand(1));
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
  } else if (VT == MVT::i32) {
    // ExtractPS works with constant index.
    if (isa<ConstantSDNode>(Op.getOperand(1)))
      return Op;
  }
  return SDValue();
}


SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDValue();

  if (Subtarget->hasSSE41()) {
    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.getNode())
      return Res;
  }

  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();
  // TODO: handle v16i8.
  if (VT.getSizeInBits() == 16) {
    SDValue Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                     DAG.getNode(ISD::BIT_CONVERT, dl,
                                                 MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it match pextrw which produces a 32-bit result.
    EVT EltVT = MVT::i32;
    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
                                    Op.getOperand(0), Op.getOperand(1));
    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
  } else if (VT.getSizeInBits() == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // SHUFPS the element to the lowest double word, then movss.
    int Mask[4] = { Idx, -1, -1, -1 };
    EVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  } else if (VT.getSizeInBits() == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    int Mask[2] = { 1, -1 };
    EVT VVT = Op.getOperand(0).getValueType();
    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
                                       DAG.getUNDEF(VVT), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();
  DebugLoc dl = Op.getDebugLoc();

  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if ((EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) &&
      isa<ConstantSDNode>(N2)) {
    unsigned Opc = (EltVT.getSizeInBits() == 8) ? X86ISD::PINSRB
                                                : X86ISD::PINSRW;
    // Transform it so it match pinsr{b,w} which expects a GR32 as its second
    // argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::f32 && isa<ConstantSDNode>(N2)) {
    // Bits [7:6] of the constant are the source select.  This will always be
    //  zero here.  The DAG Combiner may combine an extract_elt index into these
    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select.  This is the
    //  value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
    //   combine either bitwise AND or insert of float 0.0 to set these bits.
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue() << 4);
    // Create this as a scalar to vector..
    N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
    return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
  } else if (EltVT == MVT::i32 && isa<ConstantSDNode>(N2)) {
    // PINSR* works with constant index.
    return Op;
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  EVT EltVT = VT.getVectorElementType();

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  if (EltVT == MVT::i8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue N0 = Op.getOperand(0);
  SDValue N1 = Op.getOperand(1);
  SDValue N2 = Op.getOperand(2);

  if (EltVT.getSizeInBits() == 16 && isa<ConstantSDNode>(N2)) {
    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
    // as its second argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getZExtValue());
    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
  }
  return SDValue();
}

SDValue
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  if (Op.getValueType() == MVT::v2f32)
    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32,
                                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32,
                                               Op.getOperand(0))));

  if (Op.getValueType() == MVT::v1i64 && Op.getOperand(0).getValueType() == MVT::i64)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Op.getOperand(0));

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  EVT VT = MVT::v2i32;
  switch (Op.getValueType().getSimpleVT().SimpleTy) {
  default: break;
  case MVT::v16i8:
  case MVT::v8i16:
    VT = MVT::v4i32;
    break;
  }
  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, AnyExt));
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
                                             CP->getAlignment(),
                                             CP->getOffset(), OpFlag);
  DebugLoc DL = CP->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
                                          OpFlag);
  DebugLoc DL = JT->getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(), getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = 0;
  unsigned WrapperKind = X86ISD::Wrapper;
  CodeModel::Model M = getTargetMachine().getCodeModel();

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    WrapperKind = X86ISD::WrapperRIP;
  else if (Subtarget->isPICStyleGOT())
    OpFlag = X86II::MO_GOTOFF;
  else if (Subtarget->isPICStyleStubPIC())
    OpFlag = X86II::MO_PIC_BASE_OFFSET;

  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag);

  DebugLoc DL = Op.getDebugLoc();
  Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);


  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->is64Bit()) {
    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg,
                                     DebugLoc::getUnknownLoc(),
                                     getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
  // Create the TargetBlockAddressAddress node.
  unsigned char OpFlags =
    Subtarget->ClassifyBlockAddressReference();
  CodeModel::Model M = getTargetMachine().getCodeModel();
  BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  DebugLoc dl = Op.getDebugLoc();
  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
                                       /*isTarget=*/true, OpFlags);

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
                                      int64_t Offset,
                                      SelectionDAG &DAG) const {
  // Create the TargetGlobalAddress node, folding in the constant
  // offset if it is legal.
  unsigned char OpFlags =
    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
  CodeModel::Model M = getTargetMachine().getCodeModel();
  SDValue Result;
  if (OpFlags == X86II::MO_NO_FLAG &&
      X86::isOffsetSuitableForCodeModel(Offset, M)) {
    // A direct static reference to a global.
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
    Offset = 0;
  } else {
    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
  }

  if (Subtarget->isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    Result = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Result);
  else
    Result = DAG.getNode(X86ISD::Wrapper, dl, getPointerTy(), Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
                         Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (isGlobalStubReference(OpFlags))
    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
                         PseudoSourceValue::getGOT(), 0);

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
                         DAG.getConstant(Offset, getPointerTy()));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
  return LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
}

static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  DebugLoc dl = GA->getDebugLoc();
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);
  if (InFlag) {
    SDValue Ops[] = { Chain,  TGA, *InFlag };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
  } else {
    SDValue Ops[]  = { Chain, TGA };
    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI->setHasCalls(true);

  SDValue Flag = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InFlag;
  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                     DAG.getNode(X86ISD::GlobalBaseReg,
                                                 DebugLoc::getUnknownLoc(),
                                                 PtrVT), InFlag);
  InFlag = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
// "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit) {
  DebugLoc dl = GA->getDebugLoc();
  // Get the Thread Pointer
  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
                             DebugLoc::getUnknownLoc(), PtrVT,
                             DAG.getRegister(is64Bit? X86::FS : X86::GS,
                                             MVT::i32));

  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
                                      NULL, 0);

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (is64Bit) {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_GOTTPOFF;
    WrapperKind = X86ISD::WrapperRIP;
  } else {
    assert(model == TLSModel::InitialExec);
    OperandFlags = X86II::MO_INDNTPOFF;
  }

  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
  // exec)
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec)
    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         PseudoSourceValue::getGOT(), 0);