X86ISelLowering.cpp

  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
  // since the result of setcc_c is all zero's or all ones.
  if (N1C && N0.getOpcode() == ISD::AND &&
      N0.getOperand(1).getOpcode() == ISD::Constant) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
        ((N00.getOpcode() == ISD::ANY_EXTEND ||
          N00.getOpcode() == ISD::ZERO_EXTEND) &&
         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
      APInt ShAmt = N1C->getAPIntValue();
      Mask = Mask.shl(ShAmt);
      if (Mask != 0)
        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
                           N00, DAG.getConstant(Mask, VT));
    }
  }

  return SDValue();
}

/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
///                       when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  if (!VT.isVector() && VT.isInteger() &&
      N->getOpcode() == ISD::SHL)
    return PerformSHLCombine(N, DAG);

  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount.  We can't do this in legalize
  // because the a constant vector is typically transformed to a constant pool
  // so we have no knowledge of the shift amount.
  if (!Subtarget->hasSSE2())
    return SDValue();

  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  SDValue ShAmtOp = N->getOperand(1);
  EVT EltVT = VT.getVectorElementType();
  DebugLoc DL = N->getDebugLoc();
  SDValue BaseShAmt = SDValue();
  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
    unsigned NumElts = VT.getVectorNumElements();
    unsigned i = 0;
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      BaseShAmt = Arg;
      break;
    }
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      if (Arg != BaseShAmt) {
        return SDValue();
      }
    }
  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
    SDValue InVec = ShAmtOp.getOperand(0);
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      unsigned NumElts = InVec.getValueType().getVectorNumElements();
      unsigned i = 0;
      for (; i != NumElts; ++i) {
        SDValue Arg = InVec.getOperand(i);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        BaseShAmt = Arg;
        break;
      }
    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
         unsigned SplatIdx= cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
         if (C->getZExtValue() == SplatIdx)
           BaseShAmt = InVec.getOperand(1);
       }
    }
    if (BaseShAmt.getNode() == 0)
      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                              DAG.getIntPtrConstant(0));
  } else
    return SDValue();

  // The shift amount is an i32.
  if (EltVT.bitsGT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
  else if (EltVT.bitsLT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);

  // The shift amount is identical so we can do a vector shift.
  SDValue  ValOp = N->getOperand(0);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unknown shift opcode!");
    break;
  case ISD::SHL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRA:
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT ==  MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  }
  return SDValue();
}


// CMPEQCombine - Recognize the distinctive  (AND (setcc ...) (setcc ..))
// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
// and friends.  Likewise for OR -> CMPNEQSS.
static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
                            TargetLowering::DAGCombinerInfo &DCI,
                            const X86Subtarget *Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
  // we're requiring SSE2 for both.
  if (Subtarget->hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    DebugLoc DL = N->getDebugLoc();

    // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();

    SDValue CMP00 = CMP0->getOperand(0);
    SDValue CMP01 = CMP0->getOperand(1);
    EVT     VT    = CMP00.getValueType();

    if (VT == MVT::f32 || VT == MVT::f64) {
      bool ExpectingFlags = false;
      // Check for any users that want flags:
      for (SDNode::use_iterator UI = N->use_begin(),
             UE = N->use_end();
           !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }

      if (!ExpectingFlags) {
        enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
        enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

        if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }

        if ((cc0 == X86::COND_E  && cc1 == X86::COND_NP) ||
            (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
          bool is64BitFP = (CMP00.getValueType() == MVT::f64);
          X86ISD::NodeType NTOperator = is64BitFP ?
            X86ISD::FSETCCsd : X86ISD::FSETCCss;
          // FIXME: need symbolic constants for these magic numbers.
          // See X86ATTInstPrinter.cpp:printSSECC().
          unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
          SDValue OnesOrZeroesF = DAG.getNode(NTOperator, DL, MVT::f32, CMP00, CMP01,
                                              DAG.getConstant(x86cc, MVT::i8));
          SDValue OnesOrZeroesI = DAG.getNode(ISD::BITCAST, DL, MVT::i32,
                                              OnesOrZeroesF);
          SDValue ANDed = DAG.getNode(ISD::AND, DL, MVT::i32, OnesOrZeroesI,
                                      DAG.getConstant(1, MVT::i32));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}

/// CanFoldXORWithAllOnes - Test whether the XOR operand is a AllOnes vector
/// so it can be folded inside ANDNP.
static bool CanFoldXORWithAllOnes(const SDNode *N) {
  EVT VT = N->getValueType(0);

  // Match direct AllOnes for 128 and 256-bit vectors
  if (ISD::isBuildVectorAllOnes(N))
    return true;

  // Look through a bit convert.
  if (N->getOpcode() == ISD::BITCAST)
    N = N->getOperand(0).getNode();

  // Sometimes the operand may come from a insert_subvector building a 256-bit
  // allones vector
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);

  if (VT.getSizeInBits() == 256 &&
      N->getOpcode() == ISD::INSERT_SUBVECTOR &&
      V1.getOpcode() == ISD::INSERT_SUBVECTOR &&
      V1.getOperand(0).getOpcode() == ISD::UNDEF &&
      ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) &&
      ISD::isBuildVectorAllOnes(V2.getNode()))
    return true;

  return false;
}

static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  // Want to form ANDNP nodes:
  // 1) In the hopes of then easily combining them with OR and AND nodes
  //    to form PBLEND/PSIGN.
  // 2) To match ANDN packed intrinsics
  EVT VT = N->getValueType(0);
  if (VT != MVT::v2i64 && VT != MVT::v4i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  DebugLoc DL = N->getDebugLoc();

  // Check LHS for vnot
  if (N0.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N0.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);

  // Check RHS for vnot
  if (N1.getOpcode() == ISD::XOR &&
      //ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
      CanFoldXORWithAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}

static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget *Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue R = CMPEQCombine(N, DAG, DCI, Subtarget);
  if (R.getNode())
    return R;

  EVT VT = N->getValueType(0);
  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // look for psign/blend
  if (Subtarget->hasSSSE3()) {
    if (VT == MVT::v2i64) {
      // Canonicalize pandn to RHS
      if (N0.getOpcode() == X86ISD::ANDNP)
        std::swap(N0, N1);
      // or (and (m, x), (pandn m, y))
      if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) {
        SDValue Mask = N1.getOperand(0);
        SDValue X    = N1.getOperand(1);
        SDValue Y;
        if (N0.getOperand(0) == Mask)
          Y = N0.getOperand(1);
        if (N0.getOperand(1) == Mask)
          Y = N0.getOperand(0);

        // Check to see if the mask appeared in both the AND and ANDNP and
        if (!Y.getNode())
          return SDValue();

        // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them.
        if (Mask.getOpcode() != ISD::BITCAST ||
            X.getOpcode() != ISD::BITCAST ||
            Y.getOpcode() != ISD::BITCAST)
          return SDValue();

        // Look through mask bitcast.
        Mask = Mask.getOperand(0);
        EVT MaskVT = Mask.getValueType();

        // Validate that the Mask operand is a vector sra node.  The sra node
        // will be an intrinsic.
        if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
          return SDValue();

        // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
        // there is no psrai.b
        switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
        case Intrinsic::x86_sse2_psrai_w:
        case Intrinsic::x86_sse2_psrai_d:
          break;
        default: return SDValue();
        }

        // Check that the SRA is all signbits.
        SDValue SraC = Mask.getOperand(2);
        unsigned SraAmt  = cast<ConstantSDNode>(SraC)->getZExtValue();
        unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
        if ((SraAmt + 1) != EltBits)
          return SDValue();

        DebugLoc DL = N->getDebugLoc();

        // Now we know we at least have a plendvb with the mask val.  See if
        // we can form a psignb/w/d.
        // psign = x.type == y.type == mask.type && y = sub(0, x);
        X = X.getOperand(0);
        Y = Y.getOperand(0);
        if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
            ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
            X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){
          unsigned Opc = 0;
          switch (EltBits) {
          case 8: Opc = X86ISD::PSIGNB; break;
          case 16: Opc = X86ISD::PSIGNW; break;
          case 32: Opc = X86ISD::PSIGND; break;
          default: break;
          }
          if (Opc) {
            SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1));
            return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign);
          }
        }
        // PBLENDVB only available on SSE 4.1
        if (!Subtarget->hasSSE41())
          return SDValue();

        X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
        Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
        Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
        Mask = DAG.getNode(X86ISD::PBLENDVB, DL, MVT::v16i8, X, Y, Mask);
        return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
      }
    }
  }

  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
    return SDValue();
  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue ShAmt0 = N0.getOperand(1);
  if (ShAmt0.getValueType() != MVT::i8)
    return SDValue();
  SDValue ShAmt1 = N1.getOperand(1);
  if (ShAmt1.getValueType() != MVT::i8)
    return SDValue();
  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
    ShAmt0 = ShAmt0.getOperand(0);
  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
    ShAmt1 = ShAmt1.getOperand(0);

  DebugLoc DL = N->getDebugLoc();
  unsigned Opc = X86ISD::SHLD;
  SDValue Op0 = N0.getOperand(0);
  SDValue Op1 = N1.getOperand(0);
  if (ShAmt0.getOpcode() == ISD::SUB) {
    Opc = X86ISD::SHRD;
    std::swap(Op0, Op1);
    std::swap(ShAmt0, ShAmt1);
  }

  unsigned Bits = VT.getSizeInBits();
  if (ShAmt1.getOpcode() == ISD::SUB) {
    SDValue Sum = ShAmt1.getOperand(0);
    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
      SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
      if (ShAmt1Op1.getNode()->getOpcode() == ISD::TRUNCATE)
        ShAmt1Op1 = ShAmt1Op1.getOperand(0);
      if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
        return DAG.getNode(Opc, DL, VT,
                           Op0, Op1,
                           DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
    }
  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
    if (ShAmt0C &&
        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == Bits)
      return DAG.getNode(Opc, DL, VT,
                         N0.getOperand(0), N1.getOperand(0),
                         DAG.getNode(ISD::TRUNCATE, DL,
                                       MVT::i8, ShAmt0));
  }

  return SDValue();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  if (VT.getSizeInBits() != 64)
    return SDValue();

  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
    && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    DebugLoc LdDL = Ld->getDebugLoc();
    DebugLoc StDL = N->getDebugLoc();
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
                                  Ld->getPointerInfo(), Ld->isVolatile(),
                                  Ld->isNonTemporal(), Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getPointerInfo(),
                          St->isVolatile(), St->isNonTemporal(),
                          St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getPointerInfo(),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getPointerInfo().getWithOffset(4),
                               Ld->isVolatile(), Ld->isNonTemporal(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getPointerInfo(),
                                St->isVolatile(), St->isNonTemporal(),
                                St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getPointerInfo().getWithOffset(4),
                                St->isVolatile(),
                                St->isNonTemporal(),
                                MinAlign(St->getAlignment(), 4));
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}

/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
  // F[X]OR(0.0, x) -> x
  // F[X]OR(x, 0.0) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  return SDValue();
}

/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDValue();
}

static SDValue PerformBTCombine(SDNode *N,
                                SelectionDAG &DAG,
                                TargetLowering::DAGCombinerInfo &DCI) {
  // BT ignores high bits in the bit index operand.
  SDValue Op1 = N->getOperand(1);
  if (Op1.hasOneUse()) {
    unsigned BitWidth = Op1.getValueSizeInBits();
    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
    APInt KnownZero, KnownOne;
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
      DCI.CommitTargetLoweringOpt(TLO);
  }
  return SDValue();
}

static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op = N->getOperand(0);
  if (Op.getOpcode() == ISD::BITCAST)
    Op = Op.getOperand(0);
  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
      VT.getVectorElementType().getSizeInBits() ==
      OpVT.getVectorElementType().getSizeInBits()) {
    return DAG.getNode(ISD::BITCAST, N->getDebugLoc(), VT, Op);
  }
  return SDValue();
}

static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
  //           (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  DebugLoc dl = N->getDebugLoc();
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
      return SDValue();
    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (!C || C->getZExtValue() != 1)
      return SDValue();
    return DAG.getNode(ISD::AND, dl, VT,
                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                   N00.getOperand(0), N00.getOperand(1)),
                       DAG.getConstant(1, VT));
  }

  return SDValue();
}

// Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned X86CC = N->getConstantOperandVal(0);
  SDValue EFLAG = N->getOperand(1);
  DebugLoc DL = N->getDebugLoc();

  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
  // a zext and produces an all-ones bit which is more useful than 0/1 in some
  // cases.
  if (X86CC == X86::COND_B)
    return DAG.getNode(ISD::AND, DL, MVT::i8,
                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
                                   DAG.getConstant(X86CC, MVT::i8), EFLAG),
                       DAG.getConstant(1, MVT::i8));

  return SDValue();
}

static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
                                        const X86TargetLowering *XTLI) {
  SDValue Op0 = N->getOperand(0);
  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT VT = Ld->getValueType(0);
    if (!Ld->isVolatile() && !N->getValueType(0).isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !XTLI->getSubtarget()->is64Bit() &&
        !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
      SDValue FILDChain = XTLI->BuildFILD(SDValue(N, 0), Ld->getValueType(0),
                                          Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
                                 X86TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // dead right now.
      SDValue(N, 1).use_empty()) {
    DebugLoc DL = N->getDebugLoc();
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B,MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  return SDValue();
}

// fold (add Y, (sete  X, 0)) -> adc  0, Y
//      (add Y, (setne X, 0)) -> sbb -1, Y
//      (sub (sete  X, 0), Y) -> sbb  0, Y
//      (sub (setne X, 0), Y) -> adc -1, Y
static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
  DebugLoc DL = N->getDebugLoc();

  // Look through ZExts.
  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
    return SDValue();

  SDValue SetCC = Ext.getOperand(0);
  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = SetCC.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue CmpOp0 = Cmp.getOperand(0);
  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
                               DAG.getConstant(1, CmpOp0.getValueType()));

  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
  if (CC == X86::COND_NE)
    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
                       DL, OtherVal.getValueType(), OtherVal,
                       DAG.getConstant(-1ULL, OtherVal.getValueType()), NewCmp);
  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
                     DL, OtherVal.getValueType(), OtherVal,
                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
}

static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    uint64_t Op0C = C->getSExtValue();

    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      uint64_t XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getSExtValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, Op1.getDebugLoc(), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, VT));
      return DAG.getNode(ISD::ADD, N->getDebugLoc(), VT, NewXor,
                         DAG.getConstant(Op0C+1, VT));
    }
  }

  return OptimizeConditionalInDecrement(N, DAG);
}

SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::ADD:            return OptimizeConditionalInDecrement(N, DAG);
  case ISD::SUB:            return PerformSubCombine(N, DAG);
  case X86ISD::ADC:         return PerformADCCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::AND:            return PerformAndCombine(N, DAG, DCI, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return PerformSINT_TO_FPCombine(N, DAG, this);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
  case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG);
  case X86ISD::SHUFPS:      // Handle all target specific shuffles
  case X86ISD::SHUFPD:
  case X86ISD::PALIGN:
  case X86ISD::PUNPCKHBW:
  case X86ISD::PUNPCKHWD:
  case X86ISD::PUNPCKHDQ:
  case X86ISD::PUNPCKHQDQ:
  case X86ISD::UNPCKHPS:
  case X86ISD::UNPCKHPD:
  case X86ISD::VUNPCKHPSY:
  case X86ISD::VUNPCKHPDY:
  case X86ISD::PUNPCKLBW:
  case X86ISD::PUNPCKLWD:
  case X86ISD::PUNPCKLDQ:
  case X86ISD::PUNPCKLQDQ:
  case X86ISD::UNPCKLPS:
  case X86ISD::UNPCKLPD:
  case X86ISD::VUNPCKLPSY:
  case X86ISD::VUNPCKLPDY:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPERMILPS:
  case X86ISD::VPERMILPSY:
  case X86ISD::VPERMILPD:
  case X86ISD::VPERMILPDY:
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI);
  }

  return SDValue();
}

/// isTypeDesirableForOp - Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
/// instruction encodings are longer and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// IsDesirableToPromoteOp - This method query the target whether it is
/// beneficial for dag combiner to promote the specified node. If true, it
/// should return the desired promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    // If the non-extending load has a single use and it's not live out, then it
    // might be folded.
    if (LD->getExtensionType() == ISD::NON_EXTLOAD /*&&
                                                     Op.hasOneUse()*/) {
      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
        // The only case where we'd want to promote LOAD (rather then it being
        // promoted as an operand is when it's only use is liveout.
        if (UI->getOpcode() != ISD::CopyToReg)
          return false;
      }
    }
    Promote = true;
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    // fallthrough
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // FIXME: this should verify that we are targeting a 486 or better.  If not,
    // we will turn this bswap into something that will be lowered to logical ops
    // instead of emitting the bswap asm.  For now, we don't support 486 or lower
    // so don't worry about this.
    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
      if (!Ty || Ty->getBitWidth() % 16 != 0)
        return false;
      return IntrinsicLowering::LowerToByteSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        AsmPieces.size() == 3 &&
        (AsmPieces[0] == "rorw" || AsmPieces[0] == "rolw") &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      AsmPieces.clear();
      const std::string &ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      std::sort(AsmPieces.begin(), AsmPieces.end());
      if (AsmPieces.size() == 4 &&
          AsmPieces[0] == "~{cc}" &&
          AsmPieces[1] == "~{dirflag}" &&
          AsmPieces[2] == "~{flags}" &&
          AsmPieces[3] == "~{fpsr}") {
        IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
        if (!Ty || Ty->getBitWidth() % 16 != 0)
          return false;
        return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0) {
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t,");