X86ISelDAGToDAG.cpp

//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a DAG pattern matching instruction selector for X86,
// converting from a legalized dag to a X86 dag.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-isel"
#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/GlobalValue.h"
#include "llvm/Instructions.h"
#include "llvm/Intrinsics.h"
#include "llvm/Support/CFG.h"
#include "llvm/Type.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;

#include "llvm/Support/CommandLine.h"
static cl::opt<bool> AvoidDupAddrCompute("x86-avoid-dup-address", cl::Hidden);

STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");

//===----------------------------------------------------------------------===//
//                      Pattern Matcher Implementation
//===----------------------------------------------------------------------===//

namespace {
  /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
  /// SDValue's instead of register numbers for the leaves of the matched
  /// tree.
  struct X86ISelAddressMode {
    enum {
      RegBase,
      FrameIndexBase
    } BaseType;

    struct {            // This is really a union, discriminated by BaseType!
      SDValue Reg;
      int FrameIndex;
    } Base;

    unsigned Scale;
    SDValue IndexReg; 
    int32_t Disp;
    SDValue Segment;
    GlobalValue *GV;
    Constant *CP;
    const char *ES;
    int JT;
    unsigned Align;    // CP alignment.
    unsigned char SymbolFlags;  // X86II::MO_*

    X86ISelAddressMode()
      : BaseType(RegBase), Scale(1), IndexReg(), Disp(0),
        Segment(), GV(0), CP(0), ES(0), JT(-1), Align(0),
        SymbolFlags(X86II::MO_NO_FLAG) {
    }

    bool hasSymbolicDisplacement() const {
      return GV != 0 || CP != 0 || ES != 0 || JT != -1;
    }
    
    bool hasBaseOrIndexReg() const {
      return IndexReg.getNode() != 0 || Base.Reg.getNode() != 0;
    }
    
    /// isRIPRelative - Return true if this addressing mode is already RIP
    /// relative.
    bool isRIPRelative() const {
      if (BaseType != RegBase) return false;
      if (RegisterSDNode *RegNode =
            dyn_cast_or_null<RegisterSDNode>(Base.Reg.getNode()))
        return RegNode->getReg() == X86::RIP;
      return false;
    }
    
    void setBaseReg(SDValue Reg) {
      BaseType = RegBase;
      Base.Reg = Reg;
    }

    void dump() {
      errs() << "X86ISelAddressMode " << this << '\n';
      errs() << "Base.Reg ";
      if (Base.Reg.getNode() != 0)
        Base.Reg.getNode()->dump(); 
      else
        errs() << "nul";
      errs() << " Base.FrameIndex " << Base.FrameIndex << '\n'
             << " Scale" << Scale << '\n'
             << "IndexReg ";
      if (IndexReg.getNode() != 0)
        IndexReg.getNode()->dump();
      else
        errs() << "nul"; 
      errs() << " Disp " << Disp << '\n'
             << "GV ";
      if (GV)
        GV->dump();
      else
        errs() << "nul";
      errs() << " CP ";
      if (CP)
        CP->dump();
      else
        errs() << "nul";
      errs() << '\n'
             << "ES ";
      if (ES)
        errs() << ES;
      else
        errs() << "nul";
      errs() << " JT" << JT << " Align" << Align << '\n';
    }
  };
}

namespace {
  //===--------------------------------------------------------------------===//
  /// ISel - X86 specific code to select X86 machine instructions for
  /// SelectionDAG operations.
  ///
  class VISIBILITY_HIDDEN X86DAGToDAGISel : public SelectionDAGISel {
    /// X86Lowering - This object fully describes how to lower LLVM code to an
    /// X86-specific SelectionDAG.
    X86TargetLowering &X86Lowering;

    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
    /// make the right decision when generating code for different targets.
    const X86Subtarget *Subtarget;

    /// OptForSize - If true, selector should try to optimize for code size
    /// instead of performance.
    bool OptForSize;

  public:
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
      : SelectionDAGISel(tm, OptLevel),
        X86Lowering(*tm.getTargetLowering()),
        Subtarget(&tm.getSubtarget<X86Subtarget>()),
        OptForSize(false) {}

    virtual const char *getPassName() const {
      return "X86 DAG->DAG Instruction Selection";
    }

    /// InstructionSelect - This callback is invoked by
    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
    virtual void InstructionSelect();

    virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);

    virtual
      bool IsLegalAndProfitableToFold(SDNode *N, SDNode *U, SDNode *Root) const;

// Include the pieces autogenerated from the target description.
#include "X86GenDAGISel.inc"

  private:
    SDNode *Select(SDValue N);
    SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
    SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);

    bool MatchSegmentBaseAddress(SDValue N, X86ISelAddressMode &AM);
    bool MatchLoad(SDValue N, X86ISelAddressMode &AM);
    bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
    bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
    bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                 unsigned Depth);
    bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
    bool SelectAddr(SDValue Op, SDValue N, SDValue &Base,
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
                    SDValue &Segment);
    bool SelectLEAAddr(SDValue Op, SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp);
    bool SelectTLSADDRAddr(SDValue Op, SDValue N, SDValue &Base,
                       SDValue &Scale, SDValue &Index, SDValue &Disp);
    bool SelectScalarSSELoad(SDValue Op, SDValue Pred,
                             SDValue N, SDValue &Base, SDValue &Scale,
                             SDValue &Index, SDValue &Disp,
                             SDValue &Segment,
                             SDValue &InChain, SDValue &OutChain);
    bool TryFoldLoad(SDValue P, SDValue N,
                     SDValue &Base, SDValue &Scale,
                     SDValue &Index, SDValue &Disp,
                     SDValue &Segment);
    void PreprocessForRMW();
    void PreprocessForFPConvert();

    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
    /// inline asm expressions.
    virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                              char ConstraintCode,
                                              std::vector<SDValue> &OutOps);
    
    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);

    inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, 
                                   SDValue &Scale, SDValue &Index,
                                   SDValue &Disp, SDValue &Segment) {
      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
        CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
        AM.Base.Reg;
      Scale = getI8Imm(AM.Scale);
      Index = AM.IndexReg;
      // These are 32-bit even in 64-bit mode since RIP relative offset
      // is 32-bit.
      if (AM.GV)
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp,
                                              AM.SymbolFlags);
      else if (AM.CP)
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
                                             AM.Align, AM.Disp, AM.SymbolFlags);
      else if (AM.ES)
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
      else if (AM.JT != -1)
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
      else
        Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);

      if (AM.Segment.getNode())
        Segment = AM.Segment;
      else
        Segment = CurDAG->getRegister(0, MVT::i32);
    }

    /// getI8Imm - Return a target constant with the specified value, of type
    /// i8.
    inline SDValue getI8Imm(unsigned Imm) {
      return CurDAG->getTargetConstant(Imm, MVT::i8);
    }

    /// getI16Imm - Return a target constant with the specified value, of type
    /// i16.
    inline SDValue getI16Imm(unsigned Imm) {
      return CurDAG->getTargetConstant(Imm, MVT::i16);
    }

    /// getI32Imm - Return a target constant with the specified value, of type
    /// i32.
    inline SDValue getI32Imm(unsigned Imm) {
      return CurDAG->getTargetConstant(Imm, MVT::i32);
    }

    /// getGlobalBaseReg - Return an SDNode that returns the value of
    /// the global base register. Output instructions required to
    /// initialize the global base register, if necessary.
    ///
    SDNode *getGlobalBaseReg();

    /// getTargetMachine - Return a reference to the TargetMachine, casted
    /// to the target-specific type.
    const X86TargetMachine &getTargetMachine() {
      return static_cast<const X86TargetMachine &>(TM);
    }

    /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
    /// to the target-specific type.
    const X86InstrInfo *getInstrInfo() {
      return getTargetMachine().getInstrInfo();
    }

#ifndef NDEBUG
    unsigned Indent;
#endif
  };
}


bool X86DAGToDAGISel::IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
                                                 SDNode *Root) const {
  if (OptLevel == CodeGenOpt::None) return false;

  if (U == Root)
    switch (U->getOpcode()) {
    default: break;
    case ISD::ADD:
    case ISD::ADDC:
    case ISD::ADDE:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR: {
      SDValue Op1 = U->getOperand(1);

      // If the other operand is a 8-bit immediate we should fold the immediate
      // instead. This reduces code size.
      // e.g.
      // movl 4(%esp), %eax
      // addl $4, %eax
      // vs.
      // movl $4, %eax
      // addl 4(%esp), %eax
      // The former is 2 bytes shorter. In case where the increment is 1, then
      // the saving can be 4 bytes (by using incl %eax).
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
        if (Imm->getAPIntValue().isSignedIntN(8))
          return false;

      // If the other operand is a TLS address, we should fold it instead.
      // This produces
      // movl    %gs:0, %eax
      // leal    i@NTPOFF(%eax), %eax
      // instead of
      // movl    $i@NTPOFF, %eax
      // addl    %gs:0, %eax
      // if the block also has an access to a second TLS address this will save
      // a load.
      // FIXME: This is probably also true for non TLS addresses.
      if (Op1.getOpcode() == X86ISD::Wrapper) {
        SDValue Val = Op1.getOperand(0);
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
          return false;
      }
    }
    }

  // Proceed to 'generic' cycle finder code
  return SelectionDAGISel::IsLegalAndProfitableToFold(N, U, Root);
}

/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
/// and move load below the TokenFactor. Replace store's chain operand with
/// load's chain result.
static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
                                 SDValue Store, SDValue TF) {
  SmallVector<SDValue, 4> Ops;
  for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i)
    if (Load.getNode() == TF.getOperand(i).getNode())
      Ops.push_back(Load.getOperand(0));
    else
      Ops.push_back(TF.getOperand(i));
  SDValue NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
  SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF,
                                               Load.getOperand(1),
                                               Load.getOperand(2));
  CurDAG->UpdateNodeOperands(Store, NewLoad.getValue(1), Store.getOperand(1),
                             Store.getOperand(2), Store.getOperand(3));
}

/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG.
/// 
static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
                      SDValue &Load) {
  if (N.getOpcode() == ISD::BIT_CONVERT)
    N = N.getOperand(0);

  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
  if (!LD || LD->isVolatile())
    return false;
  if (LD->getAddressingMode() != ISD::UNINDEXED)
    return false;

  ISD::LoadExtType ExtType = LD->getExtensionType();
  if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD)
    return false;

  if (N.hasOneUse() &&
      N.getOperand(1) == Address &&
      N.getNode()->isOperandOf(Chain.getNode())) {
    Load = N;
    return true;
  }
  return false;
}

/// MoveBelowCallSeqStart - Replace CALLSEQ_START operand with load's chain
/// operand and move load below the call's chain operand.
static void MoveBelowCallSeqStart(SelectionDAG *CurDAG, SDValue Load,
                                  SDValue Call, SDValue CallSeqStart) {
  SmallVector<SDValue, 8> Ops;
  SDValue Chain = CallSeqStart.getOperand(0);
  if (Chain.getNode() == Load.getNode())
    Ops.push_back(Load.getOperand(0));
  else {
    assert(Chain.getOpcode() == ISD::TokenFactor &&
           "Unexpected CallSeqStart chain operand");
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
      if (Chain.getOperand(i).getNode() == Load.getNode())
        Ops.push_back(Load.getOperand(0));
      else
        Ops.push_back(Chain.getOperand(i));
    SDValue NewChain =
      CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
                      MVT::Other, &Ops[0], Ops.size());
    Ops.clear();
    Ops.push_back(NewChain);
  }
  for (unsigned i = 1, e = CallSeqStart.getNumOperands(); i != e; ++i)
    Ops.push_back(CallSeqStart.getOperand(i));
  CurDAG->UpdateNodeOperands(CallSeqStart, &Ops[0], Ops.size());
  CurDAG->UpdateNodeOperands(Load, Call.getOperand(0),
                             Load.getOperand(1), Load.getOperand(2));
  Ops.clear();
  Ops.push_back(SDValue(Load.getNode(), 1));
  for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i)
    Ops.push_back(Call.getOperand(i));
  CurDAG->UpdateNodeOperands(Call, &Ops[0], Ops.size());
}

/// isCalleeLoad - Return true if call address is a load and it can be
/// moved below CALLSEQ_START and the chains leading up to the call.
/// Return the CALLSEQ_START by reference as a second output.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain) {
  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
    return false;
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
  if (!LD ||
      LD->isVolatile() ||
      LD->getAddressingMode() != ISD::UNINDEXED ||
      LD->getExtensionType() != ISD::NON_EXTLOAD)
    return false;

  // Now let's find the callseq_start.
  while (Chain.getOpcode() != ISD::CALLSEQ_START) {
    if (!Chain.hasOneUse())
      return false;
    Chain = Chain.getOperand(0);
  }
  
  if (Chain.getOperand(0).getNode() == Callee.getNode())
    return true;
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()))
    return true;
  return false;
}


/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
/// This is only run if not in -O0 mode.
/// This allows the instruction selector to pick more read-modify-write
/// instructions. This is a common case:
///
///     [Load chain]
///         ^
///         |
///       [Load]
///       ^    ^
///       |    |
///      /      \-
///     /         |
/// [TokenFactor] [Op]
///     ^          ^
///     |          |
///      \        /
///       \      /
///       [Store]
///
/// The fact the store's chain operand != load's chain will prevent the
/// (store (op (load))) instruction from being selected. We can transform it to:
///
///     [Load chain]
///         ^
///         |
///    [TokenFactor]
///         ^
///         |
///       [Load]
///       ^    ^
///       |    |
///       |     \- 
///       |       | 
///       |     [Op]
///       |       ^
///       |       |
///       \      /
///        \    /
///       [Store]
void X86DAGToDAGISel::PreprocessForRMW() {
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
         E = CurDAG->allnodes_end(); I != E; ++I) {
    if (I->getOpcode() == X86ISD::CALL) {
      /// Also try moving call address load from outside callseq_start to just
      /// before the call to allow it to be folded.
      ///
      ///     [Load chain]
      ///         ^
      ///         |
      ///       [Load]
      ///       ^    ^
      ///       |    |
      ///      /      \--
      ///     /          |
      ///[CALLSEQ_START] |
      ///     ^          |
      ///     |          |
      /// [LOAD/C2Reg]   |
      ///     |          |
      ///      \        /
      ///       \      /
      ///       [CALL]
      SDValue Chain = I->getOperand(0);
      SDValue Load  = I->getOperand(1);
      if (!isCalleeLoad(Load, Chain))
        continue;
      MoveBelowCallSeqStart(CurDAG, Load, SDValue(I, 0), Chain);
      ++NumLoadMoved;
      continue;
    }

    if (!ISD::isNON_TRUNCStore(I))
      continue;
    SDValue Chain = I->getOperand(0);

    if (Chain.getNode()->getOpcode() != ISD::TokenFactor)
      continue;

    SDValue N1 = I->getOperand(1);
    SDValue N2 = I->getOperand(2);
    if ((N1.getValueType().isFloatingPoint() &&
         !N1.getValueType().isVector()) ||
        !N1.hasOneUse())
      continue;

    bool RModW = false;
    SDValue Load;
    unsigned Opcode = N1.getNode()->getOpcode();
    switch (Opcode) {
    case ISD::ADD:
    case ISD::MUL:
    case ISD::AND:
    case ISD::OR:
    case ISD::XOR:
    case ISD::ADDC:
    case ISD::ADDE:
    case ISD::VECTOR_SHUFFLE: {
      SDValue N10 = N1.getOperand(0);
      SDValue N11 = N1.getOperand(1);
      RModW = isRMWLoad(N10, Chain, N2, Load);
      if (!RModW)
        RModW = isRMWLoad(N11, Chain, N2, Load);
      break;
    }
    case ISD::SUB:
    case ISD::SHL:
    case ISD::SRA:
    case ISD::SRL:
    case ISD::ROTL:
    case ISD::ROTR:
    case ISD::SUBC:
    case ISD::SUBE:
    case X86ISD::SHLD:
    case X86ISD::SHRD: {
      SDValue N10 = N1.getOperand(0);
      RModW = isRMWLoad(N10, Chain, N2, Load);
      break;
    }
    }

    if (RModW) {
      MoveBelowTokenFactor(CurDAG, Load, SDValue(I, 0), Chain);
      ++NumLoadMoved;
    }
  }
}


/// PreprocessForFPConvert - Walk over the dag lowering fpround and fpextend
/// nodes that target the FP stack to be store and load to the stack.  This is a
/// gross hack.  We would like to simply mark these as being illegal, but when
/// we do that, legalize produces these when it expands calls, then expands
/// these in the same legalize pass.  We would like dag combine to be able to
/// hack on these between the call expansion and the node legalization.  As such
/// this pass basically does "really late" legalization of these inline with the
/// X86 isel pass.
void X86DAGToDAGISel::PreprocessForFPConvert() {
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
       E = CurDAG->allnodes_end(); I != E; ) {
    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
      continue;
    
    // If the source and destination are SSE registers, then this is a legal
    // conversion that should not be lowered.
    EVT SrcVT = N->getOperand(0).getValueType();
    EVT DstVT = N->getValueType(0);
    bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
    bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
    if (SrcIsSSE && DstIsSSE)
      continue;

    if (!SrcIsSSE && !DstIsSSE) {
      // If this is an FPStack extension, it is a noop.
      if (N->getOpcode() == ISD::FP_EXTEND)
        continue;
      // If this is a value-preserving FPStack truncation, it is a noop.
      if (N->getConstantOperandVal(1))
        continue;
    }
   
    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
    // FPStack has extload and truncstore.  SSE can fold direct loads into other
    // operations.  Based on this, decide what we want to do.
    EVT MemVT;
    if (N->getOpcode() == ISD::FP_ROUND)
      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
    else
      MemVT = SrcIsSSE ? SrcVT : DstVT;
    
    SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
    DebugLoc dl = N->getDebugLoc();
    
    // FIXME: optimize the case where the src/dest is a load or store?
    SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
                                          N->getOperand(0),
                                          MemTmp, NULL, 0, MemVT);
    SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
                                        NULL, 0, MemVT);

    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
    // extload we created.  This will cause general havok on the dag because
    // anything below the conversion could be folded into other existing nodes.
    // To avoid invalidating 'I', back it up to the convert node.
    --I;
    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
    
    // Now that we did that, the node is dead.  Increment the iterator to the
    // next node to process, then delete N.
    ++I;
    CurDAG->DeleteNode(N);
  }  
}

/// InstructionSelectBasicBlock - This callback is invoked by SelectionDAGISel
/// when it has created a SelectionDAG for us to codegen.
void X86DAGToDAGISel::InstructionSelect() {
  const Function *F = MF->getFunction();
  OptForSize = F->hasFnAttr(Attribute::OptimizeForSize);

  DEBUG(BB->dump());
  if (OptLevel != CodeGenOpt::None)
    PreprocessForRMW();

  // FIXME: This should only happen when not compiled with -O0.
  PreprocessForFPConvert();

  // Codegen the basic block.
#ifndef NDEBUG
  DEBUG(errs() << "===== Instruction selection begins:\n");
  Indent = 0;
#endif
  SelectRoot(*CurDAG);
#ifndef NDEBUG
  DEBUG(errs() << "===== Instruction selection ends:\n");
#endif

  CurDAG->RemoveDeadNodes();
}

/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
/// the main function.
void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
                                             MachineFrameInfo *MFI) {
  const TargetInstrInfo *TII = TM.getInstrInfo();
  if (Subtarget->isTargetCygMing())
    BuildMI(BB, DebugLoc::getUnknownLoc(),
            TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
}

void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
  // If this is main, emit special code for main.
  MachineBasicBlock *BB = MF.begin();
  if (Fn.hasExternalLinkage() && Fn.getName() == "main")
    EmitSpecialCodeForMain(BB, MF.getFrameInfo());
}


bool X86DAGToDAGISel::MatchSegmentBaseAddress(SDValue N,
                                              X86ISelAddressMode &AM) {
  assert(N.getOpcode() == X86ISD::SegmentBaseAddress);
  SDValue Segment = N.getOperand(0);

  if (AM.Segment.getNode() == 0) {
    AM.Segment = Segment;
    return false;
  }

  return true;
}

bool X86DAGToDAGISel::MatchLoad(SDValue N, X86ISelAddressMode &AM) {
  // This optimization is valid because the GNU TLS model defines that
  // gs:0 (or fs:0 on X86-64) contains its own address.
  // For more information see http://people.redhat.com/drepper/tls.pdf

  SDValue Address = N.getOperand(1);
  if (Address.getOpcode() == X86ISD::SegmentBaseAddress &&
      !MatchSegmentBaseAddress (Address, AM))
    return false;

  return true;
}

/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
/// into an addressing mode.  These wrap things that will resolve down into a
/// symbol reference.  If no match is possible, this returns true, otherwise it
/// returns false.
bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
  // If the addressing mode already has a symbol as the displacement, we can
  // never match another symbol.
  if (AM.hasSymbolicDisplacement())
    return true;

  SDValue N0 = N.getOperand(0);
  CodeModel::Model M = TM.getCodeModel();

  // Handle X86-64 rip-relative addresses.  We check this before checking direct
  // folding because RIP is preferable to non-RIP accesses.
  if (Subtarget->is64Bit() &&
      // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
      // they cannot be folded into immediate fields.
      // FIXME: This can be improved for kernel and other models?
      (M == CodeModel::Small || M == CodeModel::Kernel) &&
      // Base and index reg must be 0 in order to use %rip as base and lowering
      // must allow RIP.
      !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) {
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
      int64_t Offset = AM.Disp + G->getOffset();
      if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true;
      AM.GV = G->getGlobal();
      AM.Disp = Offset;
      AM.SymbolFlags = G->getTargetFlags();
    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
      int64_t Offset = AM.Disp + CP->getOffset();
      if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true;
      AM.CP = CP->getConstVal();
      AM.Align = CP->getAlignment();
      AM.Disp = Offset;
      AM.SymbolFlags = CP->getTargetFlags();
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
      AM.ES = S->getSymbol();
      AM.SymbolFlags = S->getTargetFlags();
    } else {
      JumpTableSDNode *J = cast<JumpTableSDNode>(N0);
      AM.JT = J->getIndex();
      AM.SymbolFlags = J->getTargetFlags();
    }

    if (N.getOpcode() == X86ISD::WrapperRIP)
      AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
    return false;
  }

  // Handle the case when globals fit in our immediate field: This is true for
  // X86-32 always and X86-64 when in -static -mcmodel=small mode.  In 64-bit
  // mode, this results in a non-RIP-relative computation.
  if (!Subtarget->is64Bit() ||
      ((M == CodeModel::Small || M == CodeModel::Kernel) &&
       TM.getRelocationModel() == Reloc::Static)) {
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
      AM.GV = G->getGlobal();
      AM.Disp += G->getOffset();
      AM.SymbolFlags = G->getTargetFlags();
    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
      AM.CP = CP->getConstVal();
      AM.Align = CP->getAlignment();
      AM.Disp += CP->getOffset();
      AM.SymbolFlags = CP->getTargetFlags();
    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
      AM.ES = S->getSymbol();
      AM.SymbolFlags = S->getTargetFlags();
    } else {
      JumpTableSDNode *J = cast<JumpTableSDNode>(N0);
      AM.JT = J->getIndex();
      AM.SymbolFlags = J->getTargetFlags();
    }
    return false;
  }

  return true;
}

/// MatchAddress - Add the specified node to the specified addressing mode,
/// returning true if it cannot be done.  This just pattern matches for the
/// addressing mode.
bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
  if (MatchAddressRecursively(N, AM, 0))
    return true;

  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
  // a smaller encoding and avoids a scaled-index.
  if (AM.Scale == 2 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base.Reg.getNode() == 0) {
    AM.Base.Reg = AM.IndexReg;
    AM.Scale = 1;
  }

  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
  // because it has a smaller encoding.
  // TODO: Which other code models can use this?
  if (TM.getCodeModel() == CodeModel::Small &&
      Subtarget->is64Bit() &&
      AM.Scale == 1 &&
      AM.BaseType == X86ISelAddressMode::RegBase &&
      AM.Base.Reg.getNode() == 0 &&
      AM.IndexReg.getNode() == 0 &&
      AM.SymbolFlags == X86II::MO_NO_FLAG &&
      AM.hasSymbolicDisplacement())
    AM.Base.Reg = CurDAG->getRegister(X86::RIP, MVT::i64);

  return false;
}

bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  bool is64Bit = Subtarget->is64Bit();
  DebugLoc dl = N.getDebugLoc();
  DEBUG({
      errs() << "MatchAddress: ";
      AM.dump();
    });
  // Limit recursion.
  if (Depth > 5)
    return MatchAddressBase(N, AM);

  CodeModel::Model M = TM.getCodeModel();

  // If this is already a %rip relative address, we can only merge immediates
  // into it.  Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements.  It isn't very important, but this should be fixed for
    // consistency.
    if (!AM.ES && AM.JT != -1) return true;

    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) {
      int64_t Val = AM.Disp + Cst->getSExtValue();
      if (X86::isOffsetSuitableForCodeModel(Val, M,
                                            AM.hasSymbolicDisplacement())) {
        AM.Disp = Val;
        return false;
      }
    }
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::Constant: {
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!is64Bit ||
        X86::isOffsetSuitableForCodeModel(AM.Disp + Val, M,
                                          AM.hasSymbolicDisplacement())) {
      AM.Disp += Val;
      return false;
    }
    break;
  }

  case X86ISD::SegmentBaseAddress:
    if (!MatchSegmentBaseAddress(N, AM))
      return false;
    break;

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!MatchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!MatchLoad(N, AM))
      return false;
    break;

  case ISD::FrameIndex:
    if (AM.BaseType == X86ISelAddressMode::RegBase
        && AM.Base.Reg.getNode() == 0) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
      break;
      
    if (ConstantSDNode
          *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        AM.Scale = 1 << Val;
        SDValue ShVal = N.getNode()->getOperand(0);

        // Okay, we know that we have a scale by now.  However, if the scaled
        // value is an add of something and a constant, we can fold the
        // constant into the disp field here.
        if (ShVal.getNode()->getOpcode() == ISD::ADD && ShVal.hasOneUse() &&
            isa<ConstantSDNode>(ShVal.getNode()->getOperand(1))) {
          AM.IndexReg = ShVal.getNode()->getOperand(0);
          ConstantSDNode *AddVal =
            cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
          uint64_t Disp = AM.Disp + (AddVal->getSExtValue() << Val);
          if (!is64Bit ||
              X86::isOffsetSuitableForCodeModel(Disp, M,
                                                AM.hasSymbolicDisplacement()))
            AM.Disp = Disp;
          else
            AM.IndexReg = ShVal;
        } else {
          AM.IndexReg = ShVal;
        }
        return false;
      }
    break;
    }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    // FALL THROUGH
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base.Reg.getNode() == 0 &&
        AM.IndexReg.getNode() == 0) {
      if (ConstantSDNode
            *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getNode()->getOperand(0);
          SDValue Reg;

          // Okay, we know that we have a scale by now.  However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
            Reg = MulVal.getNode()->getOperand(0);
            ConstantSDNode *AddVal =
              cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
            uint64_t Disp = AM.Disp + AddVal->getSExtValue() *
                                      CN->getZExtValue();
            if (!is64Bit ||
                X86::isOffsetSuitableForCodeModel(Disp, M,
                                                  AM.hasSymbolicDisplacement()))
              AM.Disp = Disp;
            else
              Reg = N.getNode()->getOperand(0);
          } else {
            Reg = N.getNode()->getOperand(0);
          }

          AM.IndexReg = AM.Base.Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if a has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
      AM = Backup;
      break;