X86ISelLowering.cpp

//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by Chris Lattner and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86ISelLowering.h"
#include "X86MachineFunctionInfo.h"
#include "X86TargetMachine.h"
#include "llvm/CallingConv.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/StringExtras.h"
using namespace llvm;

// FIXME: temporary.
// Hidden boolean command-line switch "-enable-x86-fastcc".
static cl::opt<bool>
EnableFastCC("enable-x86-fastcc",
             cl::desc("Enable fastcc on X86"),
             cl::Hidden);
/// X86TargetLowering constructor - Caches the X86 subtarget taken from TM and
/// then configures the TargetLowering base object: general lowering
/// preferences, legal addressing scales, the register class used for each
/// value type, and, for each (operation, value type) pair listed below, the
/// action (Legal / Promote / Expand / Custom) the legalizer should take.
///
/// The statements are order dependent: several operation/type pairs are set
/// more than once (e.g. the blanket vector "Expand" loop followed by the
/// per-feature overrides), and the last call wins.
X86TargetLowering::X86TargetLowering(TargetMachine &TM)
  : TargetLowering(TM) {
  Subtarget = &TM.getSubtarget<X86Subtarget>();
  // Scalar FP is done in SSE registers only when SSE2 is available.
  X86ScalarSSE = Subtarget->hasSSE2();
  // The stack pointer is RSP in 64-bit mode and ESP otherwise.
  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;

  // Set up the TargetLowering object.

  // X86 is weird, it always uses i8 for shift amounts and setcc results.
  setShiftAmountType(MVT::i8);
  setSetCCResultType(MVT::i8);
  setSetCCResultContents(ZeroOrOneSetCCResult);
  setSchedulingPreference(SchedulingForRegPressure);
  setShiftAmountFlavor(Mask);   // shl X, 32 == shl X, 0
  setStackPointerRegisterToSaveRestore(X86StackPtr);

  if (Subtarget->isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    // NOTE(review): both flags are set to false here, which reads as the
    // opposite of the sentence above -- confirm the intended meaning of the
    // setUseUnderscore*Jmp flags before touching this.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget->isTargetMingw()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  
  // Add legal addressing mode scale values.
  addLegalAddressScale(8);
  addLegalAddressScale(4);
  addLegalAddressScale(2);
  // Enter the ones which require both scale + index last. These are more
  // expensive.
  addLegalAddressScale(9);
  addLegalAddressScale(5);
  addLegalAddressScale(3);

  // Set up the register classes.
  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
  addRegisterClass(MVT::i16, X86::GR16RegisterClass);
  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
  // i64 only gets a register class in 64-bit mode.
  if (Subtarget->is64Bit())
    addRegisterClass(MVT::i64, X86::GR64RegisterClass);

  setLoadXAction(ISD::SEXTLOAD, MVT::i1, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
  } else {
    if (X86ScalarSSE)
      // If SSE i64 SINT_TO_FP is not available, expand i32 UINT_TO_FP.
      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Expand);
    else
      setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Promote);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
  // SSE has no i16 to fp conversion, only i32
  if (X86ScalarSSE)
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
  else {
    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
  }

  if (!Subtarget->is64Bit()) {
    // Custom lower SINT_TO_FP and FP_TO_SINT from/to i64 in 32-bit mode.
    setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);

  if (X86ScalarSSE) {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
  } else {
    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);

  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
  } else {
    if (X86ScalarSSE && !Subtarget->hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
    else
      // With SSE3 we can use fisttpll to convert to a signed i64.
      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Promote);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSE) {
    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
  }

  // Control flow, memmove, in-register extension/rounding and FREM.
  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
  setOperationAction(ISD::MEMMOVE          , MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
  setOperationAction(ISD::FREM             , MVT::f64  , Expand);

  // Bit counting operations are expanded for every integer width.
  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
  setOperationAction(ISD::CTTZ             , MVT::i8   , Expand);
  setOperationAction(ISD::CTLZ             , MVT::i8   , Expand);
  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
  setOperationAction(ISD::CTTZ             , MVT::i16  , Expand);
  setOperationAction(ISD::CTLZ             , MVT::i16  , Expand);
  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
  setOperationAction(ISD::CTTZ             , MVT::i32  , Expand);
  setOperationAction(ISD::CTLZ             , MVT::i32  , Expand);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
    setOperationAction(ISD::CTTZ           , MVT::i64  , Expand);
    setOperationAction(ISD::CTLZ           , MVT::i64  , Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT           , MVT::i1   , Promote);
  setOperationAction(ISD::SELECT           , MVT::i8   , Promote);
  // X86 wants to expand cmov itself.
  setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
  setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
  }
  // X86 ret instruction may pop stack.
  setOperationAction(ISD::RET             , MVT::Other, Custom);
  // Darwin ABI issue.
  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
  if (Subtarget->is64Bit()) {
    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
  }
  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
  // X86 wants to expand memset / memcpy itself.
  setOperationAction(ISD::MEMSET          , MVT::Other, Custom);
  setOperationAction(ISD::MEMCPY          , MVT::Other, Custom);

  // We don't have line number support yet.
  setOperationAction(ISD::LOCATION, MVT::Other, Expand);
  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);
  // FIXME - use subtarget debug flags
  // LABEL is expanded only on targets that are none of Darwin, ELF, Cygwin/MinGW.
  if (!Subtarget->isTargetDarwin() &&
      !Subtarget->isTargetELF() &&
      !Subtarget->isTargetCygMing())
    setOperationAction(ISD::LABEL, MVT::Other, Expand);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);

  // Use the default implementation.
  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
  if (Subtarget->is64Bit())
    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);

  if (X86ScalarSSE) {
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
    addRegisterClass(MVT::f64, X86::FR64RegisterClass);

    // Use ANDPD to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f64, Custom);
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f64, Custom);
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    // Use ANDPD and ORPD to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FREM , MVT::f64, Expand);
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);
    setOperationAction(ISD::FREM , MVT::f32, Expand);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
    setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
    addLegalFPImmediate(+0.0); // xorps / xorpd
  } else {
    // Set up the FP register classes.
    // Without scalar SSE only f64 gets a register class (RFP).
    addRegisterClass(MVT::f64, X86::RFPRegisterClass);

    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

    // FSIN/FCOS are expanded unless unsafe FP math has been requested.
    if (!UnsafeFPMath) {
      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
    }

    setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
    addLegalFPImmediate(+0.0); // FLD0
    addLegalFPImmediate(+1.0); // FLD1
    addLegalFPImmediate(-0.0); // FLD0/FCHS
    addLegalFPImmediate(-1.0); // FLD1/FCHS
  }

  // First set operation action for all vector types to expand. Then we
  // will selectively turn on ones that can be effectively codegen'd.
  // The loop visits every value type after MVT::Vector, stopping before
  // MVT::LAST_VALUETYPE.
  for (unsigned VT = (unsigned)MVT::Vector + 1;
       VT != (unsigned)MVT::LAST_VALUETYPE; VT++) {
    setOperationAction(ISD::ADD , (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::SUB , (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::FADD, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::FSUB, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::MUL , (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::FMUL, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::SDIV, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::UDIV, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::FDIV, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::SREM, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::UREM, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::LOAD, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::VECTOR_SHUFFLE,     (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, (MVT::ValueType)VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  (MVT::ValueType)VT, Expand);
  }

  if (Subtarget->hasMMX()) {
    // The 64-bit integer vector types all share the VR64 register class.
    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);

    // FIXME: add MMX packed arithmetics
    setOperationAction(ISD::BUILD_VECTOR,     MVT::v8i8,  Expand);
    setOperationAction(ISD::BUILD_VECTOR,     MVT::v4i16, Expand);
    setOperationAction(ISD::BUILD_VECTOR,     MVT::v2i32, Expand);
  }

  if (Subtarget->hasSSE1()) {
    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);

    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
  }

  if (Subtarget->hasSSE2()) {
    // The remaining 128-bit vector types all live in VR128.
    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);

    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);

    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
    // Implement v4f32 insert_vector_elt in terms of SSE2 v8i16 ones.
    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);

    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    // The loop stops before v2i64; v2i64 and v2f64 are set explicitly below.
    // NOTE(review): this relies on the MVT enum ordering between v16i8 and
    // v2i64, which is not visible in this file -- confirm in ValueTypes.h.
    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
      setOperationAction(ISD::BUILD_VECTOR,        (MVT::ValueType)VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE,      (MVT::ValueType)VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT,  (MVT::ValueType)VT, Custom);
    }
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (unsigned VT = (unsigned)MVT::v16i8; VT != (unsigned)MVT::v2i64; VT++) {
      setOperationAction(ISD::AND,    (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::AND,    (MVT::ValueType)VT, MVT::v2i64);
      setOperationAction(ISD::OR,     (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::OR,     (MVT::ValueType)VT, MVT::v2i64);
      setOperationAction(ISD::XOR,    (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::XOR,    (MVT::ValueType)VT, MVT::v2i64);
      setOperationAction(ISD::LOAD,   (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::LOAD,   (MVT::ValueType)VT, MVT::v2i64);
      setOperationAction(ISD::SELECT, (MVT::ValueType)VT, Promote);
      AddPromotedToType (ISD::SELECT, (MVT::ValueType)VT, MVT::v2i64);
    }

    // v2f64 / v2i64 loads are legal; custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::SELECT);

  // Called only after all register classes above have been added.
  computeRegisterProperties();

  // FIXME: These should be based on subtarget info. Plus, the values should
  // be smaller when we are in optimizing for size mode.
  maxStoresPerMemset = 16; // For %llvm.memset -> sequence of stores
  maxStoresPerMemcpy = 16; // For %llvm.memcpy -> sequence of stores
  maxStoresPerMemmove = 16; // For %llvm.memmove -> sequence of stores
  allowUnalignedMemoryAccesses = true; // x86 supports it!
}

//===----------------------------------------------------------------------===//
//                C & StdCall Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee cleans up the stack rather than the caller, and symbols are also
//  decorated in a special way. It doesn't support any vector arguments.

/// AddLiveIn - Register the physical register PReg as a live-in value of MF.
/// A new virtual register of class RC is created, recorded alongside PReg in
/// the function's live-in list, and returned to the caller.  RC must contain
/// PReg (checked by assertion).
static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
                          const TargetRegisterClass *RC) {
  assert(RC->contains(PReg) && "Not the correct regclass!");

  // Create the virtual register first, then tie it to the physical one.
  const unsigned NewVirtReg = MF.getSSARegMap()->createVirtualRegister(RC);
  MF.addLiveIn(PReg, NewVirtReg);

  return NewVirtReg;
}

/// HowToPassCallArgument - Decide how an argument of type ObjectVT is passed.
///
/// Inputs:
///   ObjectVT      - value type of the argument.
///   ArgInReg      - true if the argument is marked to be passed in register.
///   NumIntRegs    - number of integer registers already used.
///   NumXMMRegs    - number of XMM registers already used.
///   MaxNumIntRegs - upper bound on integer argument registers; values above 3
///                   are clamped to 3.
///   AllowVectors  - whether 128-bit vector arguments are permitted; if not,
///                   a vector type triggers an assertion.
///
/// Outputs (all reset to 0 first):
///   ObjSize    - bytes of stack used by the argument, or 0 if none.
///   ObjIntRegs - number of integer registers the argument needs.
///   ObjXMMRegs - number of XMM registers the argument needs.
///
/// An i64 marked in-register may be split: two registers if both are free,
/// otherwise one register plus a 4-byte stack slot, otherwise 8 stack bytes.
/// f32/f64 always go on the stack.  Vectors use one of at most 4 XMM
/// registers, else a 16-byte stack slot.
static void
HowToPassCallArgument(MVT::ValueType ObjectVT,
                      bool ArgInReg,
                      unsigned NumIntRegs, unsigned NumXMMRegs,
                      unsigned MaxNumIntRegs,
                      unsigned &ObjSize, unsigned &ObjIntRegs,
                      unsigned &ObjXMMRegs,
                      bool AllowVectors = true) {
  ObjSize = 0;
  ObjIntRegs = 0;
  ObjXMMRegs = 0;

  if (MaxNumIntRegs>3) {
    // We don't have too much registers on ia32! :)
    MaxNumIntRegs = 3;
  }

  switch (ObjectVT) {
  default: assert(0 && "Unhandled argument type!");
  case MVT::i8:
   if (ArgInReg && (NumIntRegs < MaxNumIntRegs))
     ObjIntRegs = 1;
   else
     ObjSize = 1;
   break;
  case MVT::i16:
   if (ArgInReg && (NumIntRegs < MaxNumIntRegs))
     ObjIntRegs = 1;
   else
     ObjSize = 2;
   break;
  case MVT::i32:
   if (ArgInReg && (NumIntRegs < MaxNumIntRegs))
     ObjIntRegs = 1;
   else
     ObjSize = 4;
   break;
  case MVT::i64:
   if (ArgInReg && (NumIntRegs+2 <= MaxNumIntRegs)) {
     // Both halves fit in registers.
     ObjIntRegs = 2;
   } else if (ArgInReg && (NumIntRegs+1 <= MaxNumIntRegs)) {
     // Only one register left: one half in a register, the other on the stack.
     ObjIntRegs = 1;
     ObjSize = 4;
   } else
     ObjSize = 8;
   // Must not fall through into the f32 case, which would overwrite the
   // ObjSize computed above with 4.
   break;
  case MVT::f32:
    ObjSize = 4;
    break;
  case MVT::f64:
    ObjSize = 8;
    break;
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v4f32:
  case MVT::v2f64:
   if (AllowVectors) {
     if (NumXMMRegs < 4)
       ObjXMMRegs = 1;
     else
       ObjSize = 16;
     break;
   } else
     assert(0 && "Unhandled argument type [vector]!");
  }
}

/// LowerCCCArguments - Lower the incoming formal arguments of a function that
/// uses the C or (when isStdCall is set) StdCall calling convention.
///
/// For every argument result of Op this produces either a copy from a
/// live-in physical register or a load from a fixed stack object, and
/// returns them, followed by the incoming chain, as one MERGE_VALUES node.
/// As side effects it records VarArgsFrameIndex (vararg functions only),
/// BytesToPopOnReturn, BytesCallerReserves, RegSaveFrameIndex and
/// ReturnAddrIndex, and forwards BytesToPopOnReturn to the function's
/// X86FunctionInfo.
SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
                                               bool isStdCall) {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *FrameInfo = MF.getFrameInfo();
  SDOperand Root = Op.getOperand(0);
  const bool isVarArg =
    cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;

  // Every result of the node except the last is an argument; the last slot
  // is filled with the chain below.
  const unsigned NumArgs = Op.Val->getNumValues() - 1;
  SmallVector<SDOperand, 8> ArgValues;

  // Add DAG nodes to load the arguments...  On entry to a function on the X86,
  // the stack frame looks like this:
  //
  // [ESP] -- return address
  // [ESP + 4] -- first argument (leftmost lexically)
  // [ESP + 8] -- second argument, if first argument is <= 4 bytes in size
  //    ...
  //
  unsigned ArgOffset    = 0; // Frame mechanisms handle retaddr slot
  unsigned NumSRetBytes = 0; // Stack bytes used for struct-return arguments
  unsigned NumIntRegs   = 0; // Integer regs consumed so far
  unsigned NumXMMRegs   = 0; // XMM regs consumed so far

  static const unsigned XMMArgRegs[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
  };
  // Rows are indexed by (type - MVT::i8): i8, i16, i32.
  static const unsigned GPRArgRegs[][3] = {
    { X86::AL,  X86::DL,  X86::CL  },
    { X86::AX,  X86::DX,  X86::CX  },
    { X86::EAX, X86::EDX, X86::ECX }
  };
  static const TargetRegisterClass* GPRClasses[3] = {
    X86::GR8RegisterClass, X86::GR16RegisterClass, X86::GR32RegisterClass
  };

  // Handle regparm attribute: decode the per-argument flag words.  Vararg
  // functions leave every entry false.
  SmallVector<bool, 8> ArgInRegs(NumArgs, false);
  SmallVector<bool, 8> SRetArgs(NumArgs, false);
  for (unsigned ArgNo = 0; !isVarArg && ArgNo != NumArgs; ++ArgNo) {
    const unsigned Flags =
      cast<ConstantSDNode>(Op.getOperand(3 + ArgNo))->getValue();
    ArgInRegs[ArgNo] = (Flags >> 1) & 1;
    SRetArgs[ArgNo]  = (Flags >> 2) & 1;
  }

  for (unsigned ArgNo = 0; ArgNo != NumArgs; ++ArgNo) {
    const MVT::ValueType ArgVT = Op.getValue(ArgNo).getValueType();

    // Ask how this argument is passed: stack bytes and/or registers.
    unsigned StackBytes = 0;
    unsigned IntRegsNeeded = 0;
    unsigned XMMRegsNeeded = 0;
    HowToPassCallArgument(ArgVT,
                          ArgInRegs[ArgNo],
                          NumIntRegs, NumXMMRegs, 3,
                          StackBytes, IntRegsNeeded, XMMRegsNeeded,
                          !isStdCall);

    // A stack slot advances the offset by at least 4 bytes.
    const unsigned SlotBytes = StackBytes > 4 ? StackBytes : 4;

    SDOperand ArgValue;

    if (IntRegsNeeded || XMMRegsNeeded) {
      switch (ArgVT) {
      default: assert(0 && "Unhandled argument type!");
      case MVT::i8:
      case MVT::i16:
      case MVT::i32: {
        const unsigned TypeRow = ArgVT - MVT::i8;
        const unsigned PhysReg = GPRArgRegs[TypeRow][NumIntRegs];
        const unsigned VirtReg = AddLiveIn(MF, PhysReg, GPRClasses[TypeRow]);
        ArgValue = DAG.getCopyFromReg(Root, VirtReg, ArgVT);
        break;
      }
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64: {
        assert(!isStdCall && "Unhandled argument type!");
        const unsigned VirtReg =
          AddLiveIn(MF, XMMArgRegs[NumXMMRegs], X86::VR128RegisterClass);
        ArgValue = DAG.getCopyFromReg(Root, VirtReg, ArgVT);
        break;
      }
      }
      NumIntRegs += IntRegsNeeded;
      NumXMMRegs += XMMRegsNeeded;
    }

    if (StackBytes) {
      // XMM arguments have to be aligned on 16-byte boundary.
      if (StackBytes == 16)
        ArgOffset = (ArgOffset + 15) & ~15u;

      // Create the SelectionDAG nodes corresponding to a load from this
      // parameter.
      const int FI = FrameInfo->CreateFixedObject(StackBytes, ArgOffset);
      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
      ArgValue = DAG.getLoad(ArgVT, Root, FIN, NULL, 0);

      // Move on to the next argument.
      ArgOffset += SlotBytes;
      if (SRetArgs[ArgNo])
        NumSRetBytes += SlotBytes;
    }

    ArgValues.push_back(ArgValue);
  }

  // The chain is the final merged value.
  ArgValues.push_back(Root);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg)
    VarArgsFrameIndex = FrameInfo->CreateFixedObject(1, ArgOffset);

  // Non-vararg StdCall: callee pops everything.  Otherwise the callee pops
  // only the hidden struct pointer and the caller reserves the argument area.
  const bool CalleePopsAll = isStdCall && !isVarArg;
  BytesToPopOnReturn  = CalleePopsAll ? ArgOffset : NumSRetBytes;
  BytesCallerReserves = CalleePopsAll ? 0u : ArgOffset;

  RegSaveFrameIndex = 0xAAAAAAA;  // X86-64 only.
  ReturnAddrIndex = 0;            // No return address slot generated yet.

  MF.getInfo<X86FunctionInfo>()->setBytesToPopOnReturn(BytesToPopOnReturn);

  // Return the new list of results.
  return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
                     &ArgValues[0], ArgValues.size());
}

SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
                                            bool isStdCall) {
  SDOperand Chain     = Op.getOperand(0);
  bool isVarArg       = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
  bool isTailCall     = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
  SDOperand Callee    = Op.getOperand(4);
  MVT::ValueType RetVT= Op.Val->getValueType(0);
  unsigned NumOps     = (Op.getNumOperands() - 5) / 2;

  static const unsigned XMMArgRegs[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
  };
  static const unsigned GPR32ArgRegs[] = {
    X86::EAX, X86::EDX,  X86::ECX
  };

  // Count how many bytes are to be pushed on the stack.
  unsigned NumBytes   = 0;
  // Keep track of the number of integer regs passed so far.
  unsigned NumIntRegs = 0;
  // Keep track of the number of XMM regs passed so far.
  unsigned NumXMMRegs = 0;
  // How much bytes on stack used for struct return
  unsigned NumSRetBytes= 0; 

  // Handle regparm attribute
  SmallVector<bool, 8> ArgInRegs(NumOps, false);
  SmallVector<bool, 8> SRetArgs(NumOps, false);
  for (unsigned i = 0; i<NumOps; ++i) {
    unsigned Flags =
      dyn_cast<ConstantSDNode>(Op.getOperand(5+2*i+1))->getValue();
    ArgInRegs[i] = (Flags >> 1) & 1;
    SRetArgs[i]  = (Flags >> 2) & 1;
  }
  
  // Calculate stack frame size
  for (unsigned i = 0; i != NumOps; ++i) {
    SDOperand Arg = Op.getOperand(5+2*i);
    unsigned ArgIncrement = 4;
    unsigned ObjSize = 0;
    unsigned ObjIntRegs = 0;
    unsigned ObjXMMRegs = 0;

    HowToPassCallArgument(Arg.getValueType(),
                          ArgInRegs[i],
                          NumIntRegs, NumXMMRegs, 3,
                          ObjSize, ObjIntRegs, ObjXMMRegs,
                          !isStdCall);
    if (ObjSize > 4)
      ArgIncrement = ObjSize;

    NumIntRegs += ObjIntRegs;
    NumXMMRegs += ObjXMMRegs;
    if (ObjSize) {
      // XMM arguments have to be aligned on 16-byte boundary.
      if (ObjSize == 16)
        NumBytes = ((NumBytes + 15) / 16) * 16;
      NumBytes += ArgIncrement;
    }
  }

  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));

  // Arguments go on the stack in reverse order, as specified by the ABI.
  unsigned ArgOffset = 0;
  NumXMMRegs = 0;
  NumIntRegs = 0;
  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
  SmallVector<SDOperand, 8> MemOpChains;
  SDOperand StackPtr = DAG.getRegister(X86StackPtr, getPointerTy());
  for (unsigned i = 0; i != NumOps; ++i) {
    SDOperand Arg = Op.getOperand(5+2*i);
    unsigned ArgIncrement = 4;
    unsigned ObjSize = 0;
    unsigned ObjIntRegs = 0;
    unsigned ObjXMMRegs = 0;

    HowToPassCallArgument(Arg.getValueType(),
                          ArgInRegs[i],
                          NumIntRegs, NumXMMRegs, 3,
                          ObjSize, ObjIntRegs, ObjXMMRegs,
                          !isStdCall);
    
    if (ObjSize > 4)
      ArgIncrement = ObjSize;

    if (Arg.getValueType() == MVT::i8 || Arg.getValueType() == MVT::i16) {
      // Promote the integer to 32 bits.  If the input type is signed use a
      // sign extend, otherwise use a zero extend.
      unsigned Flags = cast<ConstantSDNode>(Op.getOperand(5+2*i+1))->getValue();

      unsigned ExtOp = (Flags & 1) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      Arg = DAG.getNode(ExtOp, MVT::i32, Arg);
    }

    if (ObjIntRegs || ObjXMMRegs) {
      switch (Arg.getValueType()) {
      default: assert(0 && "Unhandled argument type!");
      case MVT::i32:
       RegsToPass.push_back(std::make_pair(GPR32ArgRegs[NumIntRegs], Arg));
       break;
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
       assert(!isStdCall && "Unhandled argument type!");
       RegsToPass.push_back(std::make_pair(XMMArgRegs[NumXMMRegs], Arg));
       break;
      }

      NumIntRegs += ObjIntRegs;
      NumXMMRegs += ObjXMMRegs;
    }
    if (ObjSize) {
      // XMM arguments have to be aligned on 16-byte boundary.
      if (ObjSize == 16)
        ArgOffset = ((ArgOffset + 15) / 16) * 16;
      
      SDOperand PtrOff = DAG.getConstant(ArgOffset, getPointerTy());
      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
      MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
      
      ArgOffset += ArgIncrement;   // Move on to the next argument.
      if (SRetArgs[i])
        NumSRetBytes += ArgIncrement;
    }
  }

  // Sanity check: we haven't seen NumSRetBytes > 4
  assert((NumSRetBytes<=4) &&
         "Too much space for struct-return pointer requested");
    
  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDOperand InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
                             InFlag);
    InFlag = Chain.getValue(1);
  }

  // ELF / PIC requires GOT in the EBX register before function calls via PLT
  // GOT pointer.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT()) {
    Chain = DAG.getCopyToReg(Chain, X86::EBX,
                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
                             InFlag);
    InFlag = Chain.getValue(1);
  }
  
  // If the callee is a GlobalAddress node (quite common, every direct call is)
  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
                                        getTargetMachine(), true))
      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDOperand, 8> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use of the GOT pointer in EBX.
  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
  
  if (InFlag.Val)
    Ops.push_back(InFlag);

  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
                      NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush = 0;

  if (isStdCall) {
    if (isVarArg) {
      NumBytesForCalleeToPush = NumSRetBytes;
    } else {
      NumBytesForCalleeToPush = NumBytes;
    }
  } else {
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = NumSRetBytes;
  }
  
  if (RetVT != MVT::Other)
    NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  else
    NodeTys = DAG.getVTList(MVT::Other);
  Ops.clear();
  Ops.push_back(Chain);
  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
  Ops.push_back(DAG.getConstant(NumBytesForCalleeToPush, getPointerTy()));
  Ops.push_back(InFlag);
  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
  if (RetVT != MVT::Other)
    InFlag = Chain.getValue(1);

  SmallVector<SDOperand, 8> ResultVals;
  switch (RetVT) {
  default: assert(0 && "Unknown value type to return!");
  case MVT::Other: break;
  case MVT::i8:
    Chain = DAG.getCopyFromReg(Chain, X86::AL, MVT::i8, InFlag).getValue(1);
    ResultVals.push_back(Chain.getValue(0));
    NodeTys = DAG.getVTList(MVT::i8, MVT::Other);
    break;
  case MVT::i16:
    Chain = DAG.getCopyFromReg(Chain, X86::AX, MVT::i16, InFlag).getValue(1);
    ResultVals.push_back(Chain.getValue(0));
    NodeTys = DAG.getVTList(MVT::i16, MVT::Other);
    break;
  case MVT::i32:
    if (Op.Val->getValueType(1) == MVT::i32) {
      Chain = DAG.getCopyFromReg(Chain, X86::EAX, MVT::i32, InFlag).getValue(1);
      ResultVals.push_back(Chain.getValue(0));
      Chain = DAG.getCopyFromReg(Chain, X86::EDX, MVT::i32,
                                 Chain.getValue(2)).getValue(1);
      ResultVals.push_back(Chain.getValue(0));
      NodeTys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
    } else {
      Chain = DAG.getCopyFromReg(Chain, X86::EAX, MVT::i32, InFlag).getValue(1);
      ResultVals.push_back(Chain.getValue(0));
      NodeTys = DAG.getVTList(MVT::i32, MVT::Other);
    }
    break;
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v4f32:
  case MVT::v2f64:
    assert(!isStdCall && "Unknown value type to return!");
    Chain = DAG.getCopyFromReg(Chain, X86::XMM0, RetVT, InFlag).getValue(1);
    ResultVals.push_back(Chain.getValue(0));
    NodeTys = DAG.getVTList(RetVT, MVT::Other);
    break;
  case MVT::f32:
  case MVT::f64: {
    SDVTList Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
    SDOperand GROps[] = { Chain, InFlag };
    SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, GROps, 2);
    Chain  = RetVal.getValue(1);
    InFlag = RetVal.getValue(2);
    if (X86ScalarSSE) {
      // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
      // shouldn't be necessary except that RFP cannot be live across
      // multiple blocks. When stackifier is fixed, they can be uncoupled.
      MachineFunction &MF = DAG.getMachineFunction();
      int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
      SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
      Tys = DAG.getVTList(MVT::Other);
      SDOperand Ops[] = {
        Chain, RetVal, StackSlot, DAG.getValueType(RetVT), InFlag
      };
      Chain = DAG.getNode(X86ISD::FST, Tys, Ops, 5);
      RetVal = DAG.getLoad(RetVT, Chain, StackSlot, NULL, 0);
      Chain = RetVal.getValue(1);
    }

    if (RetVT == MVT::f32 && !X86ScalarSSE)
      // FIXME: we would really like to remember that this FP_ROUND
      // operation is okay to eliminate if we allow excess FP precision.
      RetVal = DAG.getNode(ISD::FP_ROUND, MVT::f32, RetVal);
    ResultVals.push_back(RetVal);
    NodeTys = DAG.getVTList(RetVT, MVT::Other);
    break;
  }
  }

  // If the function returns void, just return the chain.
  if (ResultVals.empty())
    return Chain;

  // Otherwise, merge everything together with a MERGE_VALUES node.
  ResultVals.push_back(Chain);
  SDOperand Res = DAG.getNode(ISD::MERGE_VALUES, NodeTys,
                              &ResultVals[0], ResultVals.size());
  return Res.getValue(Op.ResNo);
}


//===----------------------------------------------------------------------===//
//                 X86-64 C Calling Convention implementation
//===----------------------------------------------------------------------===//

/// HowToPassX86_64CCCArgument - Returns how a formal argument of the specified
/// type should be passed. If it is passed on the stack, returns the size of the
/// stack slot; if it is passed in integer or XMM registers, returns the number
/// of integer or XMM registers needed.
static void
HowToPassX86_64CCCArgument(MVT::ValueType ObjectVT,
                           unsigned NumIntRegs, unsigned NumXMMRegs,
                           unsigned &ObjSize, unsigned &ObjIntRegs,
                           unsigned &ObjXMMRegs) {
  ObjSize = 0;
  ObjIntRegs = 0;
  ObjXMMRegs = 0;

  switch (ObjectVT) {
  default: assert(0 && "Unhandled argument type!");
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
  case MVT::i64:
    if (NumIntRegs < 6)
      ObjIntRegs = 1;
    else {
      switch (ObjectVT) {
      default: break;
      case MVT::i8:  ObjSize = 1; break;
      case MVT::i16: ObjSize = 2; break;
      case MVT::i32: ObjSize = 4; break;
      case MVT::i64: ObjSize = 8; break;
      }
    }
    break;
  case MVT::f32:
  case MVT::f64:
  case MVT::v16i8:
  case MVT::v8i16:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v4f32:
  case MVT::v2f64:
    if (NumXMMRegs < 8)
      ObjXMMRegs = 1;
    else {
      switch (ObjectVT) {