X86ISelLowering.cpp

                                      TII->get(X86::MOV32rm), X86::EAX)
    .addReg(TII->getGlobalBaseReg(F))
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
  }

  MI->eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}

/// EmitInstrWithCustomInserter - Expand the pseudo instruction MI, located in
/// BB, by dispatching on its opcode.
///
/// The TCRETURN*64 forms are updated in place, the FP*_TO_INT*_IN_MEM pseudos
/// are expanded inline and then erased, and every other handled opcode is
/// forwarded to a dedicated Emit* helper.  Returns BB for the cases handled
/// inline, otherwise whatever block the helper returns.
MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) const {
  switch (MI->getOpcode()) {
  // Reaching either of the next two labels is a programming error.  Use
  // llvm_unreachable rather than assert so that builds without assertions do
  // not silently fall through into the TCRETURN handling below.
  default: llvm_unreachable("Unexpected instr type to insert");
  case X86::TAILJMPd64:
  case X86::TAILJMPr64:
  case X86::TAILJMPm64:
    llvm_unreachable("TAILJMP64 would not be touched here.");
  case X86::TCRETURNdi64:
  case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    // Defs of TCRETURNxx64 has Win64's callee-saved registers, as subset.
    // On AMD64, additional defs should be added before register allocation.
    if (!Subtarget->isTargetWin64()) {
      MI->addRegisterDefined(X86::RSI);
      MI->addRegisterDefined(X86::RDI);
      MI->addRegisterDefined(X86::XMM6);
      MI->addRegisterDefined(X86::XMM7);
      MI->addRegisterDefined(X86::XMM8);
      MI->addRegisterDefined(X86::XMM9);
      MI->addRegisterDefined(X86::XMM10);
      MI->addRegisterDefined(X86::XMM11);
      MI->addRegisterDefined(X86::XMM12);
      MI->addRegisterDefined(X86::XMM13);
      MI->addRegisterDefined(X86::XMM14);
      MI->addRegisterDefined(X86::XMM15);
    }
    return BB;
  case X86::WIN_ALLOCA:
    return EmitLoweredWinAlloca(MI, BB);
  case X86::TLSCall_32:
  case X86::TLSCall_64:
    return EmitLoweredTLSCall(MI, BB);
  case X86::CMOV_GR8:
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
    return EmitLoweredSelect(MI, BB);

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
    DebugLoc DL = MI->getDebugLoc();

    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    MachineFunction *F = BB->getParent();
    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    // Rebuild the destination address from operands 0..3 of the pseudo.  Each
    // operand is bound to its own const reference: assigning through a single
    // non-const MachineOperand reference would copy the later operands over
    // operand 0 of MI instead of merely inspecting them.
    X86AddressMode AM;
    const MachineOperand &BaseOp = MI->getOperand(0);
    if (BaseOp.isReg()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = BaseOp.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = BaseOp.getIndex();
    }
    const MachineOperand &ScaleOp = MI->getOperand(1);
    if (ScaleOp.isImm())
      AM.Scale = ScaleOp.getImm();
    // NOTE(review): the index is only picked up when it is an immediate;
    // presumably a register index is not expected here -- confirm.
    const MachineOperand &IndexOp = MI->getOperand(2);
    if (IndexOp.isImm())
      AM.IndexReg = IndexOp.getImm();
    const MachineOperand &DispOp = MI->getOperand(3);
    if (DispOp.isGlobal()) {
      AM.GV = DispOp.getGlobal();
    } else {
      AM.Disp = DispOp.getImm();
    }
    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
                      .addReg(MI->getOperand(X86::AddrNumOperands).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    MI->eraseFromParent();   // The pseudo instruction is gone now.
    return BB;
  }
    // String/text processing lowering.
  case X86::PCMPISTRM128REG:
  case X86::VPCMPISTRM128REG:
    return EmitPCMP(MI, BB, 3, false /* in-mem */);
  case X86::PCMPISTRM128MEM:
  case X86::VPCMPISTRM128MEM:
    return EmitPCMP(MI, BB, 3, true /* in-mem */);
  case X86::PCMPESTRM128REG:
  case X86::VPCMPESTRM128REG:
    return EmitPCMP(MI, BB, 5, false /* in mem */);
  case X86::PCMPESTRM128MEM:
  case X86::VPCMPESTRM128MEM:
    return EmitPCMP(MI, BB, 5, true /* in mem */);

    // Thread synchronization.
  case X86::MONITOR:
    return EmitMonitor(MI, BB);
  case X86::MWAIT:
    return EmitMwait(MI, BB);

    // Atomic Lowering.
  case X86::ATOMAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
                                               X86::OR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMXOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
                                               X86::XOR32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass);
  case X86::ATOMNAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                               X86::AND32ri, X86::MOV32rm,
                                               X86::LCMPXCHG32,
                                               X86::NOT32r, X86::EAX,
                                               X86::GR32RegisterClass, true);
  case X86::ATOMMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
  case X86::ATOMMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
  case X86::ATOMUMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
  case X86::ATOMUMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);

  case X86::ATOMAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
                                               X86::OR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMXOR16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
                                               X86::XOR16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass);
  case X86::ATOMNAND16:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
                                               X86::AND16ri, X86::MOV16rm,
                                               X86::LCMPXCHG16,
                                               X86::NOT16r, X86::AX,
                                               X86::GR16RegisterClass, true);
  case X86::ATOMMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
  case X86::ATOMMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
  case X86::ATOMUMIN16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
  case X86::ATOMUMAX16:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);

  case X86::ATOMAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
                                               X86::OR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMXOR8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
                                               X86::XOR8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass);
  case X86::ATOMNAND8:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
                                               X86::AND8ri, X86::MOV8rm,
                                               X86::LCMPXCHG8,
                                               X86::NOT8r, X86::AL,
                                               X86::GR8RegisterClass, true);
  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
  // This group is for 64-bit host.
  case X86::ATOMAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
                                               X86::OR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMXOR64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
                                               X86::XOR64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass);
  case X86::ATOMNAND64:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
                                               X86::AND64ri32, X86::MOV64rm,
                                               X86::LCMPXCHG64,
                                               X86::NOT64r, X86::RAX,
                                               X86::GR64RegisterClass, true);
  case X86::ATOMMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
  case X86::ATOMMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
  case X86::ATOMUMIN64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
  case X86::ATOMUMAX64:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);

  // This group does 64-bit operations on a 32-bit host.
  case X86::ATOMAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               false);
  case X86::ATOMOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::OR32rr, X86::OR32rr,
                                               X86::OR32ri, X86::OR32ri,
                                               false);
  case X86::ATOMXOR6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::XOR32rr, X86::XOR32rr,
                                               X86::XOR32ri, X86::XOR32ri,
                                               false);
  case X86::ATOMNAND6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::AND32rr, X86::AND32rr,
                                               X86::AND32ri, X86::AND32ri,
                                               true);
  case X86::ATOMADD6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::ADD32rr, X86::ADC32rr,
                                               X86::ADD32ri, X86::ADC32ri,
                                               false);
  case X86::ATOMSUB6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::SUB32rr, X86::SBB32rr,
                                               X86::SUB32ri, X86::SBB32ri,
                                               false);
  case X86::ATOMSWAP6432:
    return EmitAtomicBit6432WithCustomInserter(MI, BB,
                                               X86::MOV32rr, X86::MOV32rr,
                                               X86::MOV32ri, X86::MOV32ri,
                                               false);
  case X86::VASTART_SAVE_XMM_REGS:
    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

  case X86::VAARG_64:
    return EmitVAARG64WithCustomInserter(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

/// computeMaskedBitsForTargetNode - Report the bits of the target node Op that
/// are known to be zero or one.  Both outputs are first cleared; for values
/// that are booleans (X86ISD::SETCC, and the non-zero results of the
/// arithmetic nodes listed below) every bit except bit 0 is then marked as
/// known zero.
void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                       const APInt &Mask,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  const unsigned Opcode = Op.getOpcode();
  assert((Opcode >= ISD::BUILTIN_OP_END ||
          Opcode == ISD::INTRINSIC_WO_CHAIN ||
          Opcode == ISD::INTRINSIC_W_CHAIN ||
          Opcode == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  // Start out knowing nothing about any bit.
  const unsigned BitWidth = Mask.getBitWidth();
  KnownZero = KnownOne = APInt(BitWidth, 0);

  switch (Opcode) {
  default:
    return;

  case X86ISD::SETCC:
    break;

  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::ADC:
  case X86ISD::SBB:
  case X86ISD::SMUL:
  case X86ISD::UMUL:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    // Only the second result of these nodes is a boolean; nothing is known
    // about result 0.
    if (Op.getResNo() == 0)
      return;
    break;
  }

  // A boolean can only have its lowest bit set.
  KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
}

/// ComputeNumSignBitsForTargetNode - Return the number of sign bits known for
/// the target node Op.  Only X86ISD::SETCC_CARRY is recognized; every other
/// node gets the conservative answer of 1.
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
                                                         unsigned Depth) const {
  // Conservative default for anything we do not recognize.
  if (Op.getOpcode() != X86ISD::SETCC_CARRY)
    return 1;

  // SETCC_CARRY produces ~0 for true and 0 for false, so every bit of the
  // scalar element is a copy of the sign bit.
  return Op.getValueType().getScalarType().getSizeInBits();
}

/// isGAPlusOffset - Return true if N is a GlobalAddress + offset, storing the
/// global in GA and the offset in Offset.  An X86ISD::Wrapper around a global
/// address node is handled here; anything else is deferred to the generic
/// TargetLowering implementation.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper)
    if (GlobalAddressSDNode *Wrapped =
          dyn_cast<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = Wrapped->getGlobal();
      Offset = Wrapped->getOffset();
      return true;
    }

  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

/// PerformShuffleCombine256 - Shuffle combine for 256-bit vectors.  Recognizes
/// a shuffle of (concat V, undef) with (concat zero-build_vector, undef) whose
/// mask keeps the low half of the first input and fills the high half with
/// element 0 of the second, and replaces it with V inserted into the low half
/// of an all-zero vector.  Returns an empty SDValue when the pattern does not
/// match.
static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc dl = N->getDebugLoc();
  ShuffleVectorSDNode *Shuf = cast<ShuffleVectorSDNode>(N);
  SDValue LHS = Shuf->getOperand(0);
  SDValue RHS = Shuf->getOperand(1);
  EVT VT = Shuf->getValueType(0);

  // Both inputs must be concatenations.
  if (LHS.getOpcode() != ISD::CONCAT_VECTORS ||
      RHS.getOpcode() != ISD::CONCAT_VECTORS)
    return SDValue();

  //
  //                   0,0,0,...
  //                      |
  //    V      UNDEF    BUILD_VECTOR    UNDEF
  //     \      /           \           /
  //  CONCAT_VECTOR         CONCAT_VECTOR
  //         \                  /
  //          \                /
  //          RESULT: V + zero extended
  //
  const bool ShapeMatches =
    RHS.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
    RHS.getOperand(1).getOpcode() == ISD::UNDEF &&
    LHS.getOperand(1).getOpcode() == ISD::UNDEF;
  if (!ShapeMatches)
    return SDValue();

  // The build_vector feeding the second concat has to be all zeros.
  if (!ISD::isBuildVectorAllZeros(RHS.getOperand(0).getNode()))
    return SDValue();

  // Mask requirements: the low half selects the matching elements of the
  // first input, and every high-half entry selects the first element of the
  // second input (undef entries are accepted in either half).
  int NumElems = VT.getVectorNumElements();
  int HalfElems = NumElems/2;
  for (int Lane = 0; Lane < HalfElems; ++Lane) {
    if (!(isUndefOrEqual(Shuf->getMaskElt(Lane), Lane) &&
          isUndefOrEqual(Shuf->getMaskElt(Lane+HalfElems), NumElems)))
      return SDValue();
  }

  // Build a zero vector and drop the 128-bit source into its low half.
  SDValue ZeroVec = getZeroVector(VT, true /* HasSSE2 */, DAG, dl);
  SDValue Inserted = Insert128BitVector(ZeroVec, LHS.getOperand(0),
                                        DAG.getConstant(0, MVT::i32), DAG, dl);
  return DCI.CombineTo(N, Inserted);
}

/// PerformShuffleCombine - Entry point for the shuffle combines.  256-bit
/// VECTOR_SHUFFLE nodes are handed to PerformShuffleCombine256; 128-bit nodes
/// are turned into a single wide load when their elements come from
/// consecutive loads.  Returns an empty SDValue when nothing applies.
static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc dl = N->getDebugLoc();
  EVT VT = N->getValueType(0);

  // Once type legalization has run we must not introduce illegal element
  // types.
  if (!DCI.isBeforeLegalize()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (!TLI.isTypeLegal(VT.getVectorElementType()))
      return SDValue();
  }

  const unsigned VTBits = VT.getSizeInBits();

  // 256-bit vectors: only pure VECTOR_SHUFFLE nodes are handled.
  if (VTBits == 256 && N->getOpcode() == ISD::VECTOR_SHUFFLE)
    return PerformShuffleCombine256(N, DAG, DCI);

  // Everything below is for 128-bit vectors only.
  if (VTBits != 128)
    return SDValue();

  // Gather the scalar feeding each result element.  If they form
  // build_vector load1, load2, load3, load4, <0, 1, 2, 3> with consecutive,
  // non-overlapping, correctly ordered addresses, a single 128-bit load can
  // replace the shuffle.
  SmallVector<SDValue, 16> Scalars;
  const unsigned NumElts = VT.getVectorNumElements();
  for (unsigned Idx = 0; Idx != NumElts; ++Idx)
    Scalars.push_back(getShuffleScalarElt(N, Idx, DAG, 0));

  return EltsFromConsecutiveLoads(VT, Scalars, dl, DAG);
}

/// PerformEXTRACT_VECTOR_ELTCombine - Detect vector gather/scatter index
/// generation and rewrite it from a series of shuffles and extracts into one
/// store of the vector to a stack temporary followed by scalar loads of the
/// individual elements.  The rewrite is done in place, so the function always
/// returns an empty SDValue.
static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                                const TargetLowering &TLI) {
  SDValue Vec = N->getOperand(0);
  EVT VecVT = Vec.getValueType();

  // Restrict this to 4-element vectors, where the shuffle-based alternative
  // becomes the more expensive option.
  if (VecVT != MVT::v4i32)
    return SDValue();

  // Every user of the vector has to be an i32 EXTRACT_VECTOR_ELT with a
  // constant index whose only user is a sign- or zero-extend, and together
  // the extracts have to cover all four elements.
  SmallVector<SDNode *, 4> Extracts;
  unsigned SeenLanes = 0;
  SDNode *VecNode = Vec.getNode();
  for (SDNode::use_iterator UseIt = VecNode->use_begin(),
       UseEnd = VecNode->use_end(); UseIt != UseEnd; ++UseIt) {
    // The use must be of the same result number as the vector itself.
    if (UseIt.getUse().getResNo() != Vec.getResNo())
      return SDValue();

    SDNode *User = *UseIt;
    if (User->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        User->getValueType(0) != MVT::i32 ||
        !User->hasOneUse())
      return SDValue();

    unsigned ExtOpc = User->use_begin()->getOpcode();
    if (ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND)
      return SDValue();

    if (!isa<ConstantSDNode>(User->getOperand(1)))
      return SDValue();

    // Remember which lane this extract reads.
    SeenLanes |=
      1 << cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();

    Extracts.push_back(User);
  }

  // Without all four lanes in use the transformation may not pay off.
  if (SeenLanes != 15)
    return SDValue();

  // Committed to the transformation from here on.
  DebugLoc dl = Vec.getDebugLoc();

  // Spill the whole vector to a temporary stack slot.
  SDValue Slot = DAG.CreateStackTemporary(VecVT);
  SDValue StoreCh = DAG.getStore(DAG.getEntryNode(), dl, Vec, Slot,
                                 MachinePointerInfo(), false, false, 0);

  // Size of one vector element in bytes.
  unsigned EltBytes = VecVT.getVectorElementType().getSizeInBits()/8;

  // Turn each recorded extract into a load of the matching element.
  for (unsigned i = 0, e = Extracts.size(); i != e; ++i) {
    SDNode *Extract = Extracts[i];

    // Compute the element's address.
    SDValue LaneIdx = Extract->getOperand(1);
    uint64_t ByteOffset =
      EltBytes * cast<ConstantSDNode>(LaneIdx)->getZExtValue();
    SDValue OffsetNode = DAG.getConstant(ByteOffset, TLI.getPointerTy());
    SDValue EltAddr = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(),
                                  Slot, OffsetNode);

    // Load the scalar, chained after the store.
    SDValue EltLoad = DAG.getLoad(Extract->getValueType(0), dl, StoreCh,
                                  EltAddr, MachinePointerInfo(),
                                  false, false, 0);

    // Replace the extract with the load.
    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), EltLoad);
  }

  // All replacements were made in place; there is no new node to return.
  return SDValue();
}

/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
///
/// Two families of combines are tried, in this order:
///  1. With SSE2 and an f32/f64 select fed by a SETCC on the same two values,
///     form X86ISD::FMIN / X86ISD::FMAX when the NaN and signed-zero
///     behaviour can be preserved (possibly by swapping the operands).
///  2. For a select between two integer constants of a legal type, replace
///     the select by arithmetic on the zero-extended condition (shift, add,
///     or an LEA-friendly multiply-add).
///
/// Returns the replacement value, or an empty SDValue if nothing applies.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions match the semantics of the common C idiom x<y?x:y but not
  // x<=y?x:y, because of how they handle negative zero (which can be
  // ignored in unsafe-math mode).
  if (Subtarget->hasSSE2() &&
      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
      Cond.getOpcode() == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    // Stays 0 unless one of the cases below decides a min/max is safe.
    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
        DAG.isEqualTo(RHS, Cond.getOperand(1))) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // Converting this to a min would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fallthrough
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly.  It is enough for either operand to
        // be known non-zero, so both operands must be consulted (mirrors the
        // SETOLE case above).
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // Converting this to a max would handle NaNs incorrectly, and swapping
        // the operands would cause it to handle comparisons between positive
        // and negative zero incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
          if (!UnsafeFPMath &&
              !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fallthrough
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
               DAG.isEqualTo(RHS, Cond.getOperand(0))) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // Converting this to a min would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // Converting this to a min would handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
          break;
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // Converting this to a min would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fallthrough
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // Converting this to a max would handle NaNs incorrectly.
        if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
          break;
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // Converting this to a max would handle comparisons between positive
        // and negative zero incorrectly, and swapping the operands would
        // cause it to handle NaNs incorrectly.
        if (!UnsafeFPMath &&
            !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
          if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
            break;
          std::swap(LHS, RHS);
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // Converting this to a max would handle both negative zeros and NaNs
        // incorrectly, but we can swap the operands to fix both.
        std::swap(LHS, RHS);
        // Fallthrough
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the TrueC/FalseC
        // values so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This requires
        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          // For i32 only the low 32 bits of the difference are meaningful.
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
              default: break;
              case 1:  // result = add base, cond
              case 2:  // result = lea base(    , cond*2)
              case 3:  // result = lea base(cond, cond*2)
              case 4:  // result = lea base(    , cond*4)
              case 5:  // result = lea base(cond, cond*4)
              case 8:  // result = lea base(    , cond*8)
              case 9:  // result = lea base(cond, cond*8)
                isFastMultiplier = true;
                break;
            }
          }

          if (isFastMultiplier) {
            // Recompute the difference at the constants' own bit width for
            // use as the multiplier constant.
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  return SDValue();
}

/// PerformCMOVCombine - Try to simplify an X86ISD::CMOV node, whose operands
/// are [FalseValue, TrueValue, CONDCODE (e.g. X86::COND_NE), CONDVAL].
///
/// Two families of rewrites are attempted:
///  * If CONDVAL is a BSR / BSF whose input is known to be non-zero, a
///    COND_E / COND_NE select is decided statically and the chosen operand is
///    returned directly.
///  * If both values are integer constants, the select is rebuilt from
///    zext(setcc) followed by a shift, an add, or a small multiply + add.
///
/// Nothing is done while the node's second (flag) result still has uses.
/// Returns a null SDValue when no rewrite applies.
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc DL = N->getDebugLoc();

  // The node may carry a second (flag) result; leave the CMOV untouched as
  // long as that result is still consumed by something.
  const bool HasFlagResult = N->getNumValues() == 2;
  if (HasFlagResult && !SDValue(N, 1).use_empty())
    return SDValue();

  SDValue FalseOp = N->getOperand(0);
  SDValue TrueOp = N->getOperand(1);
  X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
  SDValue Cond = N->getOperand(3);

  // When the operand of BSR / BSF is proven never zero, ZF cannot be set, so
  // a CMOV keyed on COND_E / COND_NE always selects the same input.
  if ((CC == X86::COND_E || CC == X86::COND_NE) &&
      (Cond.getOpcode() == X86ISD::BSR || Cond.getOpcode() == X86ISD::BSF) &&
      DAG.isKnownNeverZero(Cond.getOperand(0)))
    return (CC == X86::COND_E) ? FalseOp : TrueOp;

  // Everything below requires a select between two integer constants.  Keep
  // in mind that CMOV orders its operands the opposite way from SELECT.
  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp);
  if (!TrueC)
    return SDValue();
  ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp);
  if (!FalseC)
    return SDValue();

  // Canonicalize so that TrueC is the (unsigned) larger of the two constants,
  // inverting the condition code to compensate for the swap.
  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
    CC = X86::GetOppositeBranchCondition(CC);
    std::swap(TrueC, FalseC);
  }

  const APInt &TrueVal = TrueC->getAPIntValue();
  const APInt &FalseVal = FalseC->getAPIntValue();
  EVT ResVT = N->getValueType(0);

  // Stays null unless one of the patterns below fires.
  SDValue Result;

  if (FalseVal == 0 && TrueVal.isPowerOf2()) {
    // C ? 8 : 0 -> zext(setcc(C)) << 3, and likewise for any pow2/0 pair.
    // This works for every integer type (i8/i16 included) and shift amount.
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                                DAG.getConstant(CC, MVT::i8), Cond);
    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0),
                              SetCC);
    unsigned ShAmt = TrueVal.logBase2();
    Result = DAG.getNode(ISD::SHL, DL, Ext.getValueType(), Ext,
                         DAG.getConstant(ShAmt, MVT::i8));
  } else if (FalseVal + 1 == TrueVal) {
    // C ? cst+1 : cst -> zext(setcc(C)) + cst.  Also fine for any integer
    // type, including i8/i16.
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                                DAG.getConstant(CC, MVT::i8), Cond);
    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                              SetCC);
    Result = DAG.getNode(ISD::ADD, DL, Ext.getValueType(), Ext,
                         SDValue(FalseC, 0));
  } else if (ResVT == MVT::i32 || ResVT == MVT::i64) {
    // Patterns that will become an LEA: these need an i32 or i64 result and
    // a constant difference that is an efficient multiplier
    // (1, 2, 3, 4, 5, 8, 9).
    uint64_t Delta = TrueC->getZExtValue() - FalseC->getZExtValue();
    if (ResVT == MVT::i32)
      Delta = static_cast<unsigned>(Delta);

    //   1: add base, cond            2: lea base(    , cond*2)
    //   3: lea base(cond, cond*2)    4: lea base(    , cond*4)
    //   5: lea base(cond, cond*4)    8: lea base(    , cond*8)
    //   9: lea base(cond, cond*8)
    const bool IsFastMultiplier =
      Delta == 1 || Delta == 2 || Delta == 3 || Delta == 4 || Delta == 5 ||
      Delta == 8 || Delta == 9;

    if (IsFastMultiplier) {
      APInt Scale = TrueVal - FalseVal;
      SDValue SetCC = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                                  DAG.getConstant(CC, MVT::i8), Cond);
      // Widen the i8 setcc result to the type of the constants.
      Result = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                           SetCC);
      // Scale the 0/1 condition by the difference between the constants.
      if (Scale != 1)
        Result = DAG.getNode(ISD::MUL, DL, Result.getValueType(), Result,
                             DAG.getConstant(Scale, Result.getValueType()));
      // Add the base (the false constant) unless it is zero.
      if (FalseVal != 0)
        Result = DAG.getNode(ISD::ADD, DL, Result.getValueType(), Result,
                             SDValue(FalseC, 0));
    }
  }

  if (!Result.getNode())
    return SDValue();

  // A two-result CMOV only reaches this point with its flag result unused, so
  // supply an empty value for it when replacing the node.
  if (HasFlagResult)
    return DCI.CombineTo(N, Result, SDValue());
  return Result;
}


/// PerformMulCombine - Split an i64 multiply by a constant into a pair of
/// cheaper operations (e.g. LEA + SHL or LEA + LEA) when the constant factors
/// as {3, 5, 9} * {power of two, 3, 5, 9}.
///
/// Only runs once legalization is done and when not invoked by the legalizer.
/// The replacement is installed through DCI.CombineTo; the function itself
/// always returns a null SDValue.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *AmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!AmtNode)
    return SDValue();

  // Constants that are a power of two, or exactly 3, 5 or 9, are left alone.
  const uint64_t Amt = AmtNode->getZExtValue();
  if (isPowerOf2_64(Amt) || Amt == 3 || Amt == 5 || Amt == 9)
    return SDValue();

  // Peel off the largest of 9, 5, 3 that divides the constant evenly.
  static const uint64_t Factors[] = { 9, 5, 3 };
  uint64_t First = 0;
  uint64_t Second = 0;
  for (unsigned i = 0; i != sizeof(Factors) / sizeof(Factors[0]); ++i) {
    if (Amt % Factors[i] != 0)
      continue;
    First = Factors[i];
    Second = Amt / Factors[i];
    break;
  }

  // The remaining factor must itself be a power of two or one of 3, 5, 9.
  const bool SecondIsCheap =
    isPowerOf2_64(Second) || Second == 3 || Second == 5 || Second == 9;
  if (Second == 0 || !SecondIsCheap)
    return SDValue();

  DebugLoc DL = N->getDebugLoc();

  // If the second multiplier is a power of two, issue it first: we want the
  // multiply by 3, 5, or 9 to be folded into the addressing mode, unless the
  // lone use of this node is an add.
  const bool LoneUseIsAdd =
    N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD;
  if (isPowerOf2_64(Second) && !LoneUseIsAdd)
    std::swap(First, Second);

  // Emit the two steps in order: a power of two becomes a shift, anything
  // else becomes an X86ISD::MUL_IMM.
  const uint64_t Steps[2] = { First, Second };
  SDValue NewMul = N->getOperand(0);
  for (unsigned i = 0; i != 2; ++i) {
    const uint64_t Step = Steps[i];
    if (isPowerOf2_64(Step))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(Step), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(Step, VT));
  }

  // Do not add new nodes to DAG combiner worklist.
  DCI.CombineTo(N, NewMul, false);
  return SDValue();
}

static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
  EVT VT = N0.getValueType();