  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1 && NumElems <= 4) {
    unsigned Idx = CountTrailingZeros_32(NonZeros);
    SDOperand Item = Op.getOperand(Idx);
    // If this is an insertion of an i64 value on x86-32, and if the top bits of
    // the value are obviously zero, truncate the value to i32 and do the
    // insertion that way.  Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0.  It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
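    // For example, building <(i64 X), 0> on x86-32 where the top half of X is
    // known zero: truncate X to i32, movd it into a v4i32, zero the remaining
    // lanes with a shuffle, and bitconvert the result back to v2i64.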
    if (EVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle MMX and SSE both.
        MVT::ValueType VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
        
        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true, DAG);
        
        // Now we have our 32-bit value zero extended in the low element of
        // a vector.  If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SDOperand Ops[] = { 
            Item, DAG.getNode(ISD::UNDEF, Item.getValueType()),
            getSwapEltZeroMask(VecElts, Idx, DAG)
          };
          Item = DAG.getNode(ISD::VECTOR_SHUFFLE, VecVT, Ops, 3);
        }
        return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(), Item);
      }
    }
    
    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.  Because we can only get here
    // when NumElems <= 4, this only needs to handle i32/f32/i64/f64.
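    // For example, (v4f32 build_vector X, 0, 0, 0) becomes a single movss of
    // X into a zero vector.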
    if (Idx == 0 &&
        // Don't do this for i64 values on x86-32.
        (EVT != MVT::i64 || Subtarget->is64Bit())) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
      // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
      return getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, DAG);
    }
    
    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDOperand();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Item);
      
      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, DAG);
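      // The mask built below keeps the zero-extended scalar (element 0 of
      // Item) in lane Idx and takes element 1 (a zero lane) everywhere else;
      // e.g. Idx == 2 with NumElems == 4 gives mask <1, 1, 0, 1>.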
      MVT::ValueType MaskVT  = MVT::getIntVectorWithNumElements(NumElems);
      MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
      SmallVector<SDOperand, 8> MaskVec;
      for (unsigned i = 0; i < NumElems; i++)
        MaskVec.push_back(DAG.getConstant((i == Idx) ? 0 : 1, MaskEVT));
      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                   &MaskVec[0], MaskVec.size());
      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, Item,
                         DAG.getNode(ISD::UNDEF, VT), Mask);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1)
    return SDOperand();
  
  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDOperand();

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64)
    return SDOperand();

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDOperand V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
                                        *this);
    if (V.Val) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDOperand V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
                                        *this);
    if (V.Val) return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDOperand, 8> V;
  V.resize(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, DAG);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2+1], V[i*2],
                             getMOVLMask(NumElems, DAG));
          break;
        case 2:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
                             getMOVLMask(NumElems, DAG));
          break;
        case 3:
          V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i*2], V[i*2+1],
                             getUnpacklMask(NumElems, DAG));
          break;
      }
    }

    // Take advantage of the fact GR32 to VR128 scalar_to_vector (i.e. movd)
    // clears the upper bits.
    // FIXME: we can do the same for v4f32 case when we know both parts of
    // the lower half come from scalar_to_vector (loadf32). We should do
    // that in post legalizer dag combiner with target specific hooks.
    if (MVT::isInteger(EVT) && (NonZeros & (0x3 << 2)) == 0)
      return V[0];
    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NumElems);
    MVT::ValueType EVT = MVT::getVectorElementType(MaskVT);
    SmallVector<SDOperand, 8> MaskVec;
    bool Reverse = (NonZeros & 0x3) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i, EVT));
    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    for (unsigned i = 0; i < 2; ++i)
      if (Reverse)
        MaskVec.push_back(DAG.getConstant(1-i+NumElems, EVT));
      else
        MaskVec.push_back(DAG.getConstant(i+NumElems, EVT));
    SDOperand ShufMask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                     &MaskVec[0], MaskVec.size());
    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[0], V[1], ShufMask);
  }

  if (Values.size() > 2) {
    // Expand into a number of unpckl*.
    // e.g. for v4f32
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    SDOperand UnpckMask = getUnpacklMask(NumElems, DAG);
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, Op.getOperand(i));
    NumElems >>= 1;
    while (NumElems != 0) {
      for (unsigned i = 0; i < NumElems; ++i)
        V[i] = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V[i], V[i + NumElems],
                           UnpckMask);
      NumElems >>= 1;
    }
    return V[0];
  }

  return SDOperand();
}

static
SDOperand LowerVECTOR_SHUFFLEv8i16(SDOperand V1, SDOperand V2,
                                   SDOperand PermMask, SelectionDAG &DAG,
                                   TargetLowering &TLI) {
  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(8);
  MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
  MVT::ValueType PtrVT = TLI.getPointerTy();
  SmallVector<SDOperand, 8> MaskElts(PermMask.Val->op_begin(),
                                     PermMask.Val->op_end());

  // First record which half of which vector the low elements come from.
  SmallVector<unsigned, 4> LowQuad(4);
  for (unsigned i = 0; i < 4; ++i) {
    SDOperand Elt = MaskElts[i];
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
    int QuadIdx = EltIdx / 4;
    ++LowQuad[QuadIdx];
  }
  int BestLowQuad = -1;
  unsigned MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (LowQuad[i] > MaxQuad) {
      BestLowQuad = i;
      MaxQuad = LowQuad[i];
    }
  }

  // Record which half of which vector the high elements come from.
  SmallVector<unsigned, 4> HighQuad(4);
  for (unsigned i = 4; i < 8; ++i) {
    SDOperand Elt = MaskElts[i];
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
    int QuadIdx = EltIdx / 4;
    ++HighQuad[QuadIdx];
  }
  int BestHighQuad = -1;
  MaxQuad = 1;
  for (unsigned i = 0; i < 4; ++i) {
    if (HighQuad[i] > MaxQuad) {
      BestHighQuad = i;
      MaxQuad = HighQuad[i];
    }
  }

  // If it's possible to sort parts of either half with PSHUF{H|L}W, then do it.
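  // (PSHUFLW reorders the low four words while passing the high quadword
  //  through unchanged, and PSHUFHW does the reverse, so each half must first
  //  get its source quadword moved into place by the shufpd below.)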
  if (BestLowQuad != -1 || BestHighQuad != -1) {
    // First sort the 4 chunks in order using shufpd.
    SmallVector<SDOperand, 8> MaskVec;
    if (BestLowQuad != -1)
      MaskVec.push_back(DAG.getConstant(BestLowQuad, MVT::i32));
    else
      MaskVec.push_back(DAG.getConstant(0, MVT::i32));
    if (BestHighQuad != -1)
      MaskVec.push_back(DAG.getConstant(BestHighQuad, MVT::i32));
    else
      MaskVec.push_back(DAG.getConstant(1, MVT::i32));
    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i32, &MaskVec[0],2);
    SDOperand NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64,
                       DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V1),
                       DAG.getNode(ISD::BIT_CONVERT, MVT::v2i64, V2), Mask);
    NewV = DAG.getNode(ISD::BIT_CONVERT, MVT::v8i16, NewV);

    // Now sort high and low parts separately.
    BitVector InOrder(8);
    if (BestLowQuad != -1) {
      // Sort lower half in order using PSHUFLW.
      MaskVec.clear();
      bool AnyOutOrder = false;
      for (unsigned i = 0; i != 4; ++i) {
        SDOperand Elt = MaskElts[i];
        if (Elt.getOpcode() == ISD::UNDEF) {
          MaskVec.push_back(Elt);
          InOrder.set(i);
        } else {
          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
          if (EltIdx != i)
            AnyOutOrder = true;
          MaskVec.push_back(DAG.getConstant(EltIdx % 4, MaskEVT));
          // If this element is in the right place after this shuffle, then
          // remember it.
          if ((int)(EltIdx / 4) == BestLowQuad)
            InOrder.set(i);
        }
      }
      if (AnyOutOrder) {
        for (unsigned i = 4; i != 8; ++i)
          MaskVec.push_back(DAG.getConstant(i, MaskEVT));
        SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask);
      }
    }

    if (BestHighQuad != -1) {
      // Sort high half in order using PSHUFHW if possible.
      MaskVec.clear();
      for (unsigned i = 0; i != 4; ++i)
        MaskVec.push_back(DAG.getConstant(i, MaskEVT));
      bool AnyOutOrder = false;
      for (unsigned i = 4; i != 8; ++i) {
        SDOperand Elt = MaskElts[i];
        if (Elt.getOpcode() == ISD::UNDEF) {
          MaskVec.push_back(Elt);
          InOrder.set(i);
        } else {
          unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
          if (EltIdx != i)
            AnyOutOrder = true;
          MaskVec.push_back(DAG.getConstant((EltIdx % 4) + 4, MaskEVT));
          // If this element is in the right place after this shuffle, then
          // remember it.
          if ((int)(EltIdx / 4) == BestHighQuad)
            InOrder.set(i);
        }
      }
      if (AnyOutOrder) {
        SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
        NewV = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, NewV, NewV, Mask);
      }
    }

    // The other elements are put in the right place using pextrw and pinsrw.
    for (unsigned i = 0; i != 8; ++i) {
      if (InOrder[i])
        continue;
      SDOperand Elt = MaskElts[i];
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
      if (EltIdx == i)
        continue;
      SDOperand ExtOp = (EltIdx < 8)
        ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
                      DAG.getConstant(EltIdx, PtrVT))
        : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
                      DAG.getConstant(EltIdx - 8, PtrVT));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
                         DAG.getConstant(i, PtrVT));
    }
    return NewV;
  }

  // PSHUF{H|L}W are not used. Lower into extracts and inserts but try to use
  // as few as possible.
  // First, let's find out how many elements are already in the right order.
  unsigned V1InOrder = 0;
  unsigned V1FromV1 = 0;
  unsigned V2InOrder = 0;
  unsigned V2FromV2 = 0;
  SmallVector<SDOperand, 8> V1Elts;
  SmallVector<SDOperand, 8> V2Elts;
  for (unsigned i = 0; i < 8; ++i) {
    SDOperand Elt = MaskElts[i];
    if (Elt.getOpcode() == ISD::UNDEF) {
      V1Elts.push_back(Elt);
      V2Elts.push_back(Elt);
      ++V1InOrder;
      ++V2InOrder;
      continue;
    }
    unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
    if (EltIdx == i) {
      V1Elts.push_back(Elt);
      V2Elts.push_back(DAG.getConstant(i+8, MaskEVT));
      ++V1InOrder;
    } else if (EltIdx == i+8) {
      V1Elts.push_back(Elt);
      V2Elts.push_back(DAG.getConstant(i, MaskEVT));
      ++V2InOrder;
    } else if (EltIdx < 8) {
      V1Elts.push_back(Elt);
      V2Elts.push_back(DAG.getConstant(EltIdx+8, MaskEVT));
      ++V1FromV1;
    } else {
      V1Elts.push_back(Elt);
      V2Elts.push_back(DAG.getConstant(EltIdx-8, MaskEVT));
      ++V2FromV2;
    }
  }

  if (V2InOrder > V1InOrder) {
    PermMask = CommuteVectorShuffleMask(PermMask, DAG);
    std::swap(V1, V2);
    std::swap(V1Elts, V2Elts);
    std::swap(V1FromV1, V2FromV2);
    std::swap(V1InOrder, V2InOrder);
  }

  if ((V1FromV1 + V1InOrder) != 8) {
    // Some elements are from V2.
    if (V1FromV1) {
      // If there are elements that are from V1 but out of place,
      // then first sort them in place
      SmallVector<SDOperand, 8> MaskVec;
      for (unsigned i = 0; i < 8; ++i) {
        SDOperand Elt = V1Elts[i];
        if (Elt.getOpcode() == ISD::UNDEF) {
          MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
          continue;
        }
        unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
        if (EltIdx >= 8)
          MaskVec.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
        else
          MaskVec.push_back(DAG.getConstant(EltIdx, MaskEVT));
      }
      SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &MaskVec[0], 8);
      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v8i16, V1, V1, Mask);
    }

    SDOperand NewV = V1;
    for (unsigned i = 0; i < 8; ++i) {
      SDOperand Elt = V1Elts[i];
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
      if (EltIdx < 8)
        continue;
      SDOperand ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V2,
                                    DAG.getConstant(EltIdx - 8, PtrVT));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
                         DAG.getConstant(i, PtrVT));
    }
    return NewV;
  } else {
    // All elements are from V1.
    SDOperand NewV = V1;
    for (unsigned i = 0; i < 8; ++i) {
      SDOperand Elt = V1Elts[i];
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
      SDOperand ExtOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i16, V1,
                                    DAG.getConstant(EltIdx, PtrVT));
      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, MVT::v8i16, NewV, ExtOp,
                         DAG.getConstant(i, PtrVT));
    }
    return NewV;
  }
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle <>, <>, < 4, 5, | 10, 11, | 0, 1, | 14, 15>
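/// which can be rewritten as a v4i32 shuffle with mask < 2, 5, 0, 7 >.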
static
SDOperand RewriteAsNarrowerShuffle(SDOperand V1, SDOperand V2,
                                MVT::ValueType VT,
                                SDOperand PermMask, SelectionDAG &DAG,
                                TargetLowering &TLI) {
  unsigned NumElems = PermMask.getNumOperands();
  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
  MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
  MVT::ValueType NewVT = MaskVT;
  switch (VT) {
  case MVT::v4f32: NewVT = MVT::v2f64; break;
  case MVT::v4i32: NewVT = MVT::v2i64; break;
  case MVT::v8i16: NewVT = MVT::v4i32; break;
  case MVT::v16i8: NewVT = MVT::v4i32; break;
  default: assert(false && "Unexpected!");
  }

  if (NewWidth == 2) {
    if (MVT::isInteger(VT))
      NewVT = MVT::v2i64;
    else
      NewVT = MVT::v2f64;
  }
  unsigned Scale = NumElems / NewWidth;
  SmallVector<SDOperand, 8> MaskVec;
  for (unsigned i = 0; i < NumElems; i += Scale) {
    unsigned StartIdx = ~0U;
    for (unsigned j = 0; j < Scale; ++j) {
      SDOperand Elt = PermMask.getOperand(i+j);
      if (Elt.getOpcode() == ISD::UNDEF)
        continue;
      unsigned EltIdx = cast<ConstantSDNode>(Elt)->getValue();
      if (StartIdx == ~0U)
        StartIdx = EltIdx - (EltIdx % Scale);
      if (EltIdx != StartIdx + j)
        return SDOperand();
    }
    if (StartIdx == ~0U)
      MaskVec.push_back(DAG.getNode(ISD::UNDEF, MVT::i32));
    else
      MaskVec.push_back(DAG.getConstant(StartIdx / Scale, MVT::i32));
  }

  V1 = DAG.getNode(ISD::BIT_CONVERT, NewVT, V1);
  V2 = DAG.getNode(ISD::BIT_CONVERT, NewVT, V2);
  return DAG.getNode(ISD::VECTOR_SHUFFLE, NewVT, V1, V2,
                     DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                 &MaskVec[0], MaskVec.size()));
}

SDOperand
X86TargetLowering::LowerVECTOR_SHUFFLE(SDOperand Op, SelectionDAG &DAG) {
  SDOperand V1 = Op.getOperand(0);
  SDOperand V2 = Op.getOperand(1);
  SDOperand PermMask = Op.getOperand(2);
  MVT::ValueType VT = Op.getValueType();
  unsigned NumElems = PermMask.getNumOperands();
  bool isMMX = MVT::getSizeInBits(VT) == 64;
  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsSplat = false;
  bool V2IsSplat = false;
  bool Commuted = false;

  if (isUndefShuffle(Op.Val))
    return DAG.getNode(ISD::UNDEF, VT);

  if (isZeroShuffle(Op.Val))
    return getZeroVector(VT, DAG);

  if (isIdentityMask(PermMask.Val))
    return V1;
  else if (isIdentityMask(PermMask.Val, true))
    return V2;

  if (isSplatMask(PermMask.Val)) {
    if (isMMX || NumElems < 4) return Op;
    // Promote it to a v4{if}32 splat.
    return PromoteSplat(Op, DAG, Subtarget->hasSSE2());
  }

  // If the shuffle can be profitably rewritten as a narrower shuffle, then
  // do it!
  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
    SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
    if (NewOp.Val)
      return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
    // FIXME: Figure out a cleaner way to do this.
    // Try to make use of movq to zero out the top part.
    if (ISD::isBuildVectorAllZeros(V2.Val)) {
      SDOperand NewOp = RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
      if (NewOp.Val) {
        SDOperand NewV1 = NewOp.getOperand(0);
        SDOperand NewV2 = NewOp.getOperand(1);
        SDOperand NewMask = NewOp.getOperand(2);
        if (isCommutedMOVL(NewMask.Val, true, false)) {
          NewOp = CommuteVectorShuffle(NewOp, NewV1, NewV2, NewMask, DAG);
          NewOp = DAG.getNode(ISD::VECTOR_SHUFFLE, NewOp.getValueType(),
                              NewV1, NewV2, getMOVLMask(2, DAG));
          return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
        }
      }
    } else if (ISD::isBuildVectorAllZeros(V1.Val)) {
      SDOperand NewOp= RewriteAsNarrowerShuffle(V1, V2, VT, PermMask, DAG, *this);
      if (NewOp.Val && X86::isMOVLMask(NewOp.getOperand(2).Val))
        return DAG.getNode(ISD::BIT_CONVERT, VT, LowerVECTOR_SHUFFLE(NewOp, DAG));
    }
  }

  if (X86::isMOVLMask(PermMask.Val))
    return (V1IsUndef) ? V2 : Op;
  if (X86::isMOVSHDUPMask(PermMask.Val) ||
      X86::isMOVSLDUPMask(PermMask.Val) ||
      X86::isMOVHLPSMask(PermMask.Val) ||
      X86::isMOVHPMask(PermMask.Val) ||
      X86::isMOVLPMask(PermMask.Val))
    return Op;
  if (ShouldXformToMOVHLPS(PermMask.Val) ||
      ShouldXformToMOVLP(V1.Val, V2.Val, PermMask.Val))
    return CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
  // 1,1,1,1 -> v8i16 though.
  V1IsSplat = isSplatVector(V1.Val);
  V2IsSplat = isSplatVector(V2.Val);
  
  // Canonicalize the splat or undef, if present, to be on the RHS.
  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    std::swap(V1IsSplat, V2IsSplat);
    std::swap(V1IsUndef, V2IsUndef);
    Commuted = true;
  }

  // FIXME: Figure out a cleaner way to do this.
  if (isCommutedMOVL(PermMask.Val, V2IsSplat, V2IsUndef)) {
    if (V2IsUndef) return V1;
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (V2IsSplat) {
      // V2 is a splat, so the mask may be malformed. That is, it may point
      // to any V2 element. The instruction selector won't like this. Get
      // a corrected mask and commute to form a proper MOVS{S|D}.
      SDOperand NewMask = getMOVLMask(NumElems, DAG);
      if (NewMask.Val != PermMask.Val)
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
    }
    return Op;
  }

  if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
      X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
      X86::isUNPCKLMask(PermMask.Val) ||
      X86::isUNPCKHMask(PermMask.Val))
    return Op;
  if (V2IsSplat) {
    // Normalize mask so all entries that point to V2 point to its first
    // element, then try to match unpck{h|l} again. If a match is found, return
    // new vector_shuffle with the corrected mask.
    SDOperand NewMask = NormalizeMask(PermMask, DAG);
    if (NewMask.Val != PermMask.Val) {
      if (X86::isUNPCKLMask(PermMask.Val, true)) {
        SDOperand NewMask = getUnpacklMask(NumElems, DAG);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
      } else if (X86::isUNPCKHMask(PermMask.Val, true)) {
        SDOperand NewMask = getUnpackhMask(NumElems, DAG);
        return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, NewMask);
      }
    }
  }

  // Normalize the node to match x86 shuffle ops if needed
  if (V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(PermMask.Val))
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);

  if (Commuted) {
    // Commute it back and try unpck* again.
    Op = CommuteVectorShuffle(Op, V1, V2, PermMask, DAG);
    if (X86::isUNPCKL_v_undef_Mask(PermMask.Val) ||
        X86::isUNPCKH_v_undef_Mask(PermMask.Val) ||
        X86::isUNPCKLMask(PermMask.Val) ||
        X86::isUNPCKHMask(PermMask.Val))
      return Op;
  }
  // Try PSHUF* first, then SHUFP*.
  // MMX doesn't have PSHUFD but it does have PSHUFW. While it's theoretically
  // possible to shuffle a v2i32 using PSHUFW, that's not yet implemented.
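  // (PSHUFW is the MMX analogue of PSHUFD: it permutes the four 16-bit words
  // of a 64-bit MMX register according to an 8-bit immediate.)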
  if (isMMX && NumElems == 4 && X86::isPSHUFDMask(PermMask.Val)) {
    if (V2.getOpcode() != ISD::UNDEF)
      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1,
                         DAG.getNode(ISD::UNDEF, VT), PermMask);
    return Op;
  }

  if (!isMMX) {
    if (Subtarget->hasSSE2() &&
        (X86::isPSHUFDMask(PermMask.Val) ||
         X86::isPSHUFHWMask(PermMask.Val) ||
         X86::isPSHUFLWMask(PermMask.Val))) {
      MVT::ValueType RVT = VT;
      if (VT == MVT::v4f32) {
        RVT = MVT::v4i32;
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT,
                         DAG.getNode(ISD::BIT_CONVERT, RVT, V1),
                         DAG.getNode(ISD::UNDEF, RVT), PermMask);
      } else if (V2.getOpcode() != ISD::UNDEF)
        Op = DAG.getNode(ISD::VECTOR_SHUFFLE, RVT, V1,
                         DAG.getNode(ISD::UNDEF, RVT), PermMask);
      if (RVT != VT)
        Op = DAG.getNode(ISD::BIT_CONVERT, VT, Op);
      return Op;
    }

    // Binary or unary shufps.
    if (X86::isSHUFPMask(PermMask.Val) ||
        (V2.getOpcode() == ISD::UNDEF && X86::isPSHUFDMask(PermMask.Val)))
      return Op;
  }

  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
  if (VT == MVT::v8i16) {
    SDOperand NewOp = LowerVECTOR_SHUFFLEv8i16(V1, V2, PermMask, DAG, *this);
    if (NewOp.Val)
      return NewOp;
  }
  // Handle all 4 wide cases with a number of shuffles.
  if (NumElems == 4 && !isMMX) {
    // Don't do this for MMX.
    MVT::ValueType MaskVT = PermMask.getValueType();
    MVT::ValueType MaskEVT = MVT::getVectorElementType(MaskVT);
    SmallVector<std::pair<int, int>, 8> Locs;
    Locs.resize(NumElems);
    SmallVector<SDOperand, 8> Mask1(NumElems,
                                    DAG.getNode(ISD::UNDEF, MaskEVT));
    SmallVector<SDOperand, 8> Mask2(NumElems,
                                    DAG.getNode(ISD::UNDEF, MaskEVT));
    unsigned NumHi = 0;
    unsigned NumLo = 0;
    // If no more than two elements come from either vector, this can be
    // implemented with two shuffles. The first shuffle gathers the elements.
    // The second shuffle, which takes the first shuffle as both of its
    // vector operands, puts the elements into the right order.
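    // e.g. mask <2, 6, 3, 7>: the first shuffle gathers T = <V1[2], V1[3],
    // V2[2], V2[3]>, then the second shuffle of T with itself, mask
    // <0, 2, 5, 7>, yields <V1[2], V2[2], V1[3], V2[3]>.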
    for (unsigned i = 0; i != NumElems; ++i) {
      SDOperand Elt = PermMask.getOperand(i);
      if (Elt.getOpcode() == ISD::UNDEF) {
        Locs[i] = std::make_pair(-1, -1);
      } else {
        unsigned Val = cast<ConstantSDNode>(Elt)->getValue();
        if (Val < NumElems) {
          Locs[i] = std::make_pair(0, NumLo);
          Mask1[NumLo] = Elt;
          NumLo++;
        } else {
          Locs[i] = std::make_pair(1, NumHi);
          if (2+NumHi < NumElems)
            Mask1[2+NumHi] = Elt;
          NumHi++;
        }
      }
    }
    if (NumLo <= 2 && NumHi <= 2) {
      V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                   &Mask1[0], Mask1.size()));
      for (unsigned i = 0; i != NumElems; ++i) {
        if (Locs[i].first == -1)
          continue;
        else {
          unsigned Idx = (i < NumElems/2) ? 0 : NumElems;
          Idx += Locs[i].first * (NumElems/2) + Locs[i].second;
          Mask2[i] = DAG.getConstant(Idx, MaskEVT);
        }
      }

      return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1,
                         DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                     &Mask2[0], Mask2.size()));
    }

    // Break it into (shuffle shuffle_hi, shuffle_lo).
    Locs.clear();
    Locs.resize(NumElems);
    SmallVector<SDOperand,8> LoMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
    SmallVector<SDOperand,8> HiMask(NumElems, DAG.getNode(ISD::UNDEF, MaskEVT));
    SmallVector<SDOperand,8> *MaskPtr = &LoMask;
    unsigned MaskIdx = 0;
    unsigned LoIdx = 0;
    unsigned HiIdx = NumElems/2;
    for (unsigned i = 0; i != NumElems; ++i) {
      if (i == NumElems/2) {
        MaskPtr = &HiMask;
        MaskIdx = 1;
        LoIdx = 0;
        HiIdx = NumElems/2;
      }
      SDOperand Elt = PermMask.getOperand(i);
      if (Elt.getOpcode() == ISD::UNDEF) {
        Locs[i] = std::make_pair(-1, -1);
      } else if (cast<ConstantSDNode>(Elt)->getValue() < NumElems) {
        Locs[i] = std::make_pair(MaskIdx, LoIdx);
        (*MaskPtr)[LoIdx] = Elt;
        LoIdx++;
      } else {
        Locs[i] = std::make_pair(MaskIdx, HiIdx);
        (*MaskPtr)[HiIdx] = Elt;
        HiIdx++;
      }
    }

    SDOperand LoShuffle =
      DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
                  DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                              &LoMask[0], LoMask.size()));
    SDOperand HiShuffle =
      DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2,
                  DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                              &HiMask[0], HiMask.size()));
    SmallVector<SDOperand, 8> MaskOps;
    for (unsigned i = 0; i != NumElems; ++i) {
      if (Locs[i].first == -1) {
        MaskOps.push_back(DAG.getNode(ISD::UNDEF, MaskEVT));
      } else {
        unsigned Idx = Locs[i].first * NumElems + Locs[i].second;
        MaskOps.push_back(DAG.getConstant(Idx, MaskEVT));
      }
    }
    return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, LoShuffle, HiShuffle,
                       DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                   &MaskOps[0], MaskOps.size()));
  }

  return SDOperand();
}

SDOperand
X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDOperand Op,
                                                SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  if (MVT::getSizeInBits(VT) == 8) {
    SDOperand Extract = DAG.getNode(X86ISD::PEXTRB, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
    SDOperand Assert  = DAG.getNode(ISD::AssertZext, MVT::i32, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
  } else if (MVT::getSizeInBits(VT) == 16) {
    SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, MVT::i32,
                                    Op.getOperand(0), Op.getOperand(1));
    SDOperand Assert  = DAG.getNode(ISD::AssertZext, MVT::i32, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
  } else if (VT == MVT::f32) {
    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
    // the result back to FR32 register. It's only worth matching if the
    // result has a single use which is a store or a bitcast to i32.
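    // (EXTRACTPS can also write directly to memory, so a single-use store of
    // the extracted lane needs no GPR round trip at all.)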
    SDNode *User = Op.Val->use_begin()->getUser();
    if (User->getOpcode() != ISD::STORE &&
        (User->getOpcode() != ISD::BIT_CONVERT ||
         User->getValueType(0) != MVT::i32))
      return SDOperand();
    SDOperand Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32,
                    DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Op.getOperand(0)),
                                    Op.getOperand(1));
    return DAG.getNode(ISD::BIT_CONVERT, MVT::f32, Extract);
  }
  return SDOperand();
}

SDOperand
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
  if (!isa<ConstantSDNode>(Op.getOperand(1)))
    return SDOperand();

  if (Subtarget->hasSSE41()) {
    SDOperand Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
    if (Res.Val)
      return Res;
  }
  MVT::ValueType VT = Op.getValueType();
  // TODO: handle v16i8.
  if (MVT::getSizeInBits(VT) == 16) {
    SDOperand Vec = Op.getOperand(0);
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
    if (Idx == 0)
      return DAG.getNode(ISD::TRUNCATE, MVT::i16,
                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i32,
                                 DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, Vec),
                                     Op.getOperand(1)));
    // Transform it so it matches pextrw, which produces a 32-bit result.
    MVT::ValueType EVT = (MVT::ValueType)(VT+1);
    SDOperand Extract = DAG.getNode(X86ISD::PEXTRW, EVT,
                                    Op.getOperand(0), Op.getOperand(1));
    SDOperand Assert  = DAG.getNode(ISD::AssertZext, EVT, Extract,
                                    DAG.getValueType(VT));
    return DAG.getNode(ISD::TRUNCATE, VT, Assert);
  } else if (MVT::getSizeInBits(VT) == 32) {
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
    if (Idx == 0)
      return Op;
    // SHUFPS the element to the lowest double word, then movss.
    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(4);
    SmallVector<SDOperand, 8> IdxVec;
    IdxVec.
      push_back(DAG.getConstant(Idx, MVT::getVectorElementType(MaskVT)));
    IdxVec.
      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
    IdxVec.
      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
    IdxVec.
      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                 &IdxVec[0], IdxVec.size());
    SDOperand Vec = Op.getOperand(0);
    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
                       DAG.getIntPtrConstant(0));
  } else if (MVT::getSizeInBits(VT) == 64) {
    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
    //        to match extract_elt for f64.
    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
    if (Idx == 0)
      return Op;

    // UNPCKHPD the element to the lowest double word, then movsd.
    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
    MVT::ValueType MaskVT = MVT::getIntVectorWithNumElements(2);
    SmallVector<SDOperand, 8> IdxVec;
    IdxVec.push_back(DAG.getConstant(1, MVT::getVectorElementType(MaskVT)));
    IdxVec.
      push_back(DAG.getNode(ISD::UNDEF, MVT::getVectorElementType(MaskVT)));
    SDOperand Mask = DAG.getNode(ISD::BUILD_VECTOR, MaskVT,
                                 &IdxVec[0], IdxVec.size());
    SDOperand Vec = Op.getOperand(0);
    Vec = DAG.getNode(ISD::VECTOR_SHUFFLE, Vec.getValueType(),
                      Vec, DAG.getNode(ISD::UNDEF, Vec.getValueType()), Mask);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, VT, Vec,
                       DAG.getIntPtrConstant(0));
  }

  return SDOperand();
}

SDOperand
X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDOperand Op, SelectionDAG &DAG){
  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType EVT = MVT::getVectorElementType(VT);

  SDOperand N0 = Op.getOperand(0);
  SDOperand N1 = Op.getOperand(1);
  SDOperand N2 = Op.getOperand(2);

  if ((MVT::getSizeInBits(EVT) == 8) || (MVT::getSizeInBits(EVT) == 16)) {
    unsigned Opc = (MVT::getSizeInBits(EVT) == 8) ? X86ISD::PINSRB
                                                  : X86ISD::PINSRW;
    // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
    // argument.
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue());
    return DAG.getNode(Opc, VT, N0, N1, N2);
  } else if (EVT == MVT::f32) {
    // Bits [7:6] of the constant are the source select.  This will always be
    //  zero here.  The DAG Combiner may combine an extract_elt index into these
    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
    // Bits [5:4] of the constant are the destination select.  This is the 
    //  value of the incoming immediate.
    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may 
    //   combine either bitwise AND or insert of float 0.0 to set these bits.
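    //  For example, a plain insert into element 2 yields an immediate of
    //  (2 << 4) == 0x20: source select 0, destination select 2, zero mask 0.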
    N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue() << 4);
    return DAG.getNode(X86ISD::INSERTPS, VT, N0, N1, N2);
  }
  return SDOperand();
}

SDOperand
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDOperand Op, SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType EVT = MVT::getVectorElementType(VT);

  if (Subtarget->hasSSE41())
    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);

  SDOperand N0 = Op.getOperand(0);
  SDOperand N1 = Op.getOperand(1);
  SDOperand N2 = Op.getOperand(2);

  if (MVT::getSizeInBits(EVT) == 16) {
    // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
    // as its second argument.
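    // (pinsrw reads only the low 16 bits of its GR32 operand, so the
    // any_extend below is free.)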
    if (N1.getValueType() != MVT::i32)
      N1 = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, N1);
    if (N2.getValueType() != MVT::i32)
      N2 = DAG.getIntPtrConstant(cast<ConstantSDNode>(N2)->getValue());
    return DAG.getNode(X86ISD::PINSRW, VT, N0, N1, N2);
  }

  return SDOperand();
}

SDOperand
X86TargetLowering::LowerSCALAR_TO_VECTOR(SDOperand Op, SelectionDAG &DAG) {
  SDOperand AnyExt = DAG.getNode(ISD::ANY_EXTEND, MVT::i32, Op.getOperand(0));
  MVT::ValueType VT = MVT::v2i32;
  switch (Op.getValueType()) {
  default: break;
  case MVT::v16i8:
  case MVT::v8i16:
    VT = MVT::v4i32;
    break;
  }
  return DAG.getNode(ISD::BIT_CONVERT, Op.getValueType(),
                     DAG.getNode(ISD::SCALAR_TO_VECTOR, VT, AnyExt));
// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
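// For instance, (load (X86ISD::Wrapper (TargetConstantPool ...))) lets the
// constant pool address be folded into the load's memory operand instead of
// requiring a separate address computation.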
SDOperand
X86TargetLowering::LowerConstantPool(SDOperand Op, SelectionDAG &DAG) {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
  SDOperand Result = DAG.getTargetConstantPool(CP->getConstVal(),
                                               getPointerTy(),
                                               CP->getAlignment());
  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
                         Result);
  }

  return Result;
}

SDOperand
X86TargetLowering::LowerGlobalAddress(SDOperand Op, SelectionDAG &DAG) {
  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
  SDOperand Result = DAG.getTargetGlobalAddress(GV, getPointerTy());
  // If it's a debug information descriptor, don't mess with it.
  if (DAG.isVerifiedDebugInfoDesc(Op))
    return Result;
  Result = DAG.getNode(X86ISD::Wrapper, getPointerTy(), Result);
  // With PIC, the address is actually $g + Offset.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      !Subtarget->isPICStyleRIPRel()) {
    Result = DAG.getNode(ISD::ADD, getPointerTy(),
                         DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
                         Result);
  }

  // For Darwin & Mingw32, external and weak symbols are indirect, so we want
  // to load the value at address GV, not the value of GV itself. This means
  // that the GlobalAddress must be in the base or index register of the
  // address, not in the GV offset field. The platform check is inside the
  // GVRequiresExtraLoad() call. The same applies for external symbols during
  // PIC codegen.