X86ISelLowering.cpp

  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool isVarArg = TheCall->isVarArg();
  CCState CCInfo(CallingConv, isVarArg, getTargetMachine(), RVLocs);
  CCInfo.AnalyzeCallResult(TheCall, RetCC_X86);

  SmallVector<SDValue, 8> ResultVals;
  
  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    MVT CopyVT = RVLocs[i].getValVT();
    
    // If this is a call to a function that returns an fp value on the floating
    // point stack, but where we prefer to use the value in xmm registers, copy
    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((RVLocs[i].getLocReg() == X86::ST0 ||
         RVLocs[i].getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(RVLocs[i].getValVT())) {
      CopyVT = MVT::f80;
    }
    
    Chain = DAG.getCopyFromReg(Chain, RVLocs[i].getLocReg(),
                               CopyVT, InFlag).getValue(1);
    SDValue Val = Chain.getValue(0);
    InFlag = Chain.getValue(2);

    if (CopyVT != RVLocs[i].getValVT()) {
      // Round the F80 the right size, which also moves to the appropriate xmm
      // register.
      Val = DAG.getNode(ISD::FP_ROUND, RVLocs[i].getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1));
    }
    
    ResultVals.push_back(Val);
  }

  // Merge everything together with a MERGE_VALUES node.
  ResultVals.push_back(Chain);
  return DAG.getMergeValues(TheCall->getVTList(), &ResultVals[0],
                            ResultVals.size()).getNode();
}


//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and around. It differs from C calling convention just a little:
//  callee should clean up the stack, not caller. Symbols should be also
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// AddLiveIn - This helper function adds the specified physical register to the
/// MachineFunction as a live in value.  It also creates a corresponding virtual
/// register for it.
static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
                          const TargetRegisterClass *RC) {
  assert(RC->contains(PReg) && "Not the correct regclass!");
  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
  MF.getRegInfo().addLiveIn(PReg, VReg);
  return VReg;
}

/// CallIsStructReturn - Determines whether a CALL node uses struct return
/// semantics.
static bool CallIsStructReturn(CallSDNode *TheCall) {
  unsigned NumOps = TheCall->getNumArgs();
  if (!NumOps)
    return false;

  return TheCall->getArgFlags(0).isSRet();
}

/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
/// return semantics.
static bool ArgsAreStructReturn(SDValue Op) {
  unsigned NumArgs = Op.getNode()->getNumValues() - 1;
  if (!NumArgs)
    return false;

  return cast<ARG_FLAGSSDNode>(Op.getOperand(3))->getArgFlags().isSRet();
}

/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires
/// the callee to pop its own arguments. Callee pop is necessary to support tail
/// calls.
bool X86TargetLowering::IsCalleePop(bool IsVarArg, unsigned CallingConv) {
  if (IsVarArg)
    return false;

  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
    return !Subtarget->is64Bit();
  case CallingConv::X86_FastCall:
    return !Subtarget->is64Bit();
  case CallingConv::Fast:
    return PerformTailCallOpt;
  }
}

/// CCAssignFnForNode - Selects the correct CCAssignFn for a the
/// given CallingConvention value.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(unsigned CC) const {
  if (Subtarget->is64Bit()) {
    if (Subtarget->isTargetWin64())
      return CC_X86_Win64_C;
    else if (CC == CallingConv::Fast && PerformTailCallOpt)
      return CC_X86_64_TailCall;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
    return CC_X86_32_FastCall;
  else if (CC == CallingConv::Fast)
    return CC_X86_32_FastCC;
  else
    return CC_X86_32_C;
}

/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
NameDecorationStyle
X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDValue Op) {
  unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
  if (CC == CallingConv::X86_FastCall)
    return FastCall;
  else if (CC == CallingConv::X86_StdCall)
    return StdCall;
  return None;
}


/// CallRequiresGOTInRegister - Check whether the call requires the GOT pointer
/// in a register before calling.
bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
  return !IsTailCall && !Is64Bit &&
    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
    Subtarget->isPICStyleGOT();
}

/// CallRequiresFnAddressInReg - Check whether the call requires the function
/// address to be loaded in a register.
bool 
X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
  return !Is64Bit && IsTailCall &&  
    getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
    Subtarget->isPICStyleGOT();
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval
/// function parameter.
static SDValue 
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG) {
  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
}

SDValue X86TargetLowering::LowerMemArgument(SDValue Op, SelectionDAG &DAG,
                                              const CCValAssign &VA,
                                              MachineFrameInfo *MFI,
                                              unsigned CC,
                                              SDValue Root, unsigned i) {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags =
    cast<ARG_FLAGSSDNode>(Op.getOperand(3 + i))->getArgFlags();
  bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt;
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.  
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
                                  VA.getLocMemOffset(), isImmutable);
  SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
  if (Flags.isByVal())
    return FIN;
  return DAG.getLoad(VA.getValVT(), Root, FIN,
                     PseudoSourceValue::getFixedStack(FI), 0);
}

SDValue
X86TargetLowering::LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  
  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  // Decorate the function name.
  FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));
  
  MachineFrameInfo *MFI = MF.getFrameInfo();
  SDValue Root = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;
  unsigned CC = MF.getFunction()->getCallingConv();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isTargetWin64();

  assert(!(isVarArg && CC == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeFormalArguments(Op.getNode(), CCAssignFnForNode(CC));
  
  SmallVector<SDValue, 8> ArgValues;
  unsigned LastVal = ~0U;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    LastVal = VA.getValNo();
    
    if (VA.isRegLoc()) {
      MVT RegVT = VA.getLocVT();
      TargetRegisterClass *RC;
      if (RegVT == MVT::i32)
        RC = X86::GR32RegisterClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
        RC = X86::VR128RegisterClass;
      else if (RegVT.isVector()) {
        assert(RegVT.getSizeInBits() == 64);
        if (!Is64Bit)
          RC = X86::VR64RegisterClass;     // MMX values are passed in MMXs.
        else {
          // Darwin calling convention passes MMX values in either GPRs or
          // XMMs in x86-64. Other targets pass them in memory.
          if (RegVT != MVT::v1i64 && Subtarget->hasSSE2()) {
            RC = X86::VR128RegisterClass;  // MMX values are passed in XMMs.
            RegVT = MVT::v2i64;
          } else {
            RC = X86::GR64RegisterClass;   // v1i64 values are passed in GPRs.
            RegVT = MVT::i64;
          }
        }
      } else {
        assert(0 && "Unknown argument type!");
      }

      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
      SDValue ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
      
      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      
      if (VA.getLocInfo() != CCValAssign::Full)
        ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
      
      // Handle MMX values passed in GPRs.
      if (Is64Bit && RegVT != VA.getLocVT()) {
        if (RegVT.getSizeInBits() == 64 && RC == X86::GR64RegisterClass)
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
        else if (RC == X86::VR128RegisterClass) {
          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, MVT::i64, ArgValue,
                                 DAG.getConstant(0, MVT::i64));
          ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
        }
      }
      
      ArgValues.push_back(ArgValue);
    } else {
      assert(VA.isMemLoc());
      ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i));
    }
  }

  // The x86-64 ABI for returning structs by value requires that we copy
  // the sret argument into %rax for the return. Save the argument into
  // a virtual register so that we can access it from the return points.
  if (Is64Bit && DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
    MachineFunction &MF = DAG.getMachineFunction();
    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
    unsigned Reg = FuncInfo->getSRetReturnReg();
    if (!Reg) {
      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
      FuncInfo->setSRetReturnReg(Reg);
    }
    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), Reg, ArgValues[0]);
    Root = DAG.getNode(ISD::TokenFactor, MVT::Other, Copy, Root);
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // align stack specially for tail calls
  if (PerformTailCallOpt && CC == CallingConv::Fast)
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    if (Is64Bit || CC != CallingConv::X86_FastCall) {
      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const unsigned GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8,  X86::R9
      };
      static const unsigned XMMArgRegsWin64[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
      };
      static const unsigned GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const unsigned XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const unsigned *GPR64ArgRegs, *XMMArgRegs;

      if (IsWin64) {
        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
        XMMArgRegs = XMMArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;
        XMMArgRegs = XMMArgRegs64Bit;
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);
      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
                                                       TotalNumXMMRegs);

      // For X86-64, if there are vararg parameters that are passed via
      // registers, then we must store them to their spots on the stack so they
      // may be loaded by deferencing the result of va_next.
      VarArgsGPOffset = NumIntRegs * 8;
      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
                                                 TotalNumXMMRegs * 16, 16);

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
      SDValue FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(VarArgsGPOffset));
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
                                  X86::GR64RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), Val, FIN,
                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
                          DAG.getIntPtrConstant(8));
      }

      // Now store the XMM (fp + vector) parameter registers.
      FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
                        DAG.getIntPtrConstant(VarArgsFPOffset));
      for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
        unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
                                  X86::VR128RegisterClass);
        SDValue Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
        SDValue Store =
          DAG.getStore(Val.getValue(1), Val, FIN,
                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex), 0);
        MemOps.push_back(Store);
        FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
                          DAG.getIntPtrConstant(16));
      }
      if (!MemOps.empty())
          Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
                             &MemOps[0], MemOps.size());
    }
  }
  
  ArgValues.push_back(Root);

  // Some CCs need callee pop.
  if (IsCalleePop(isVarArg, CC)) {
    BytesToPopOnReturn  = StackSize; // Callee pops everything.
    BytesCallerReserves = 0;
  } else {
    BytesToPopOnReturn  = 0; // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && CC != CallingConv::Fast && ArgsAreStructReturn(Op))
      BytesToPopOnReturn = 4;  
    BytesCallerReserves = StackSize;
  }

  if (!Is64Bit) {
    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
    if (CC == CallingConv::X86_FastCall)
      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
  }

  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);

  // Return the new list of results.
  return DAG.getMergeValues(Op.getNode()->getVTList(), &ArgValues[0],
                            ArgValues.size()).getValue(Op.getResNo());
}

SDValue
X86TargetLowering::LowerMemOpCallTo(CallSDNode *TheCall, SelectionDAG &DAG,
                                    const SDValue &StackPtr,
                                    const CCValAssign &VA,
                                    SDValue Chain,
                                    SDValue Arg, ISD::ArgFlagsTy Flags) {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal()) {
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG);
  }
  return DAG.getStore(Chain, Arg, PtrOff,
                      PseudoSourceValue::getStack(), LocMemOffset);
}

/// EmitTailCallLoadRetAddr - Emit a load of return adress if tail call
/// optimization is performed and it is required.
SDValue 
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, 
                                           SDValue &OutRetAddr,
                                           SDValue Chain, 
                                           bool IsTailCall, 
                                           bool Is64Bit, 
                                           int FPDiff) {
  if (!IsTailCall || FPDiff==0) return Chain;

  // Adjust the Return address stack slot.
  MVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);
  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, Chain,OutRetAddr, NULL, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return adress if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue 
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, 
                         SDValue Chain, SDValue RetAddrFrIdx,
                         bool Is64Bit, int FPDiff) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int SlotSize = Is64Bit ? 8 : 4;
  int NewReturnAddrFI = 
    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
  MVT VT = Is64Bit ? MVT::i64 : MVT::i32;
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
  Chain = DAG.getStore(Chain, RetAddrFrIdx, NewRetAddrFrIdx, 
                       PseudoSourceValue::getFixedStack(NewReturnAddrFI), 0);
  return Chain;
}

SDValue X86TargetLowering::LowerCALL(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  CallSDNode *TheCall = cast<CallSDNode>(Op.getNode());
  SDValue Chain       = TheCall->getChain();
  unsigned CC         = TheCall->getCallingConv();
  bool isVarArg       = TheCall->isVarArg();
  bool IsTailCall     = TheCall->isTailCall() &&
                        CC == CallingConv::Fast && PerformTailCallOpt;
  SDValue Callee      = TheCall->getCallee();
  bool Is64Bit        = Subtarget->is64Bit();
  bool IsStructRet    = CallIsStructReturn(TheCall);

  assert(!(isVarArg && CC == CallingConv::Fast) &&
         "Var args not supported with calling convention fastcc");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
  CCInfo.AnalyzeCallOperands(TheCall, CCAssignFnForNode(CC));
  
  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (PerformTailCallOpt && CC == CallingConv::Fast)
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (IsTailCall) {
    // Lower arguments at fp - stackoffset + fpdiff.
    unsigned NumBytesCallerPushed = 
      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if delta is greater than previous delta.
    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
  }

  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));

  SDValue RetAddrFrIdx;
  // Load return adress for tail calls.
  Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, IsTailCall, Is64Bit,
                                  FPDiff);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads.  In the case
  // of tail call optimization arguments are handle later.
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = TheCall->getArg(i);
    ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
    bool isByVal = Flags.isByVal();
  
    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: assert(0 && "Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
      break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
      break;
    }
    
    if (VA.isRegLoc()) {
      if (Is64Bit) {
        MVT RegVT = VA.getLocVT();
        if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
          switch (VA.getLocReg()) {
          default:
            break;
          case X86::RDI: case X86::RSI: case X86::RDX: case X86::RCX:
          case X86::R8: {
            // Special case: passing MMX values in GPR registers.
            Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Arg);
            break;
          }
          case X86::XMM0: case X86::XMM1: case X86::XMM2: case X86::XMM3:
          case X86::XMM4: case X86::XMM5: case X86::XMM6: case X86::XMM7: {
            // Special case: passing MMX values in XMM registers.
            Arg = DAG.getNode(ISD::BIT_CONVERT, MVT::i64, Arg);
            Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, MVT::v2i64, Arg);
            Arg = DAG.getNode(ISD::VECTOR_SHUFFLE, MVT::v2i64,
                              DAG.getNode(ISD::UNDEF, MVT::v2i64), Arg,
                              getMOVLMask(2, DAG));
            break;
          }
          }
      }
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      if (!IsTailCall || (IsTailCall && isByVal)) {
        assert(VA.isMemLoc());
        if (StackPtr.getNode() == 0)
          StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy());
        
        MemOpChains.push_back(LowerMemOpCallTo(TheCall, DAG, StackPtr, VA,
                                               Chain, Arg, Flags));
      }
    }
  }
  
  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                        &MemOpChains[0], MemOpChains.size());

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  // Tail call byval lowering might overwrite argument registers so in case of
  // tail call optimization the copies to registers are lowered later.
  if (!IsTailCall)
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
                               InFlag);
      InFlag = Chain.getValue(1);
    }

  // ELF / PIC requires GOT in the EBX register before function calls via PLT
  // GOT pointer.  
  if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
    Chain = DAG.getCopyToReg(Chain, X86::EBX,
                             DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
                             InFlag);
    InFlag = Chain.getValue(1);
  }
  // If we are tail calling and generating PIC/GOT style code load the address
  // of the callee into ecx. The value in ecx is used as target of the tail
  // jump. This is done to circumvent the ebx/callee-saved problem for tail
  // calls on PIC/GOT architectures. Normally we would just put the address of
  // GOT into ebx and then call target@PLT. But for tail callss ebx would be
  // restored (since ebx is callee saved) before jumping to the target@PLT.
  if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
    // Note: The actual moving to ecx is done further down.
    GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
    if (G && !G->getGlobal()->hasHiddenVisibility() &&
        !G->getGlobal()->hasProtectedVisibility())
      Callee =  LowerGlobalAddress(Callee, DAG);
    else if (isa<ExternalSymbolSDNode>(Callee))
      Callee = LowerExternalSymbol(Callee,DAG);
  }

  if (Is64Bit && isVarArg) {
    // From AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an ubound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // FIXME: Verify this on Win64
    // Count the number of XMM registers allocated.
    static const unsigned XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    
    Chain = DAG.getCopyToReg(Chain, X86::AL,
                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
    InFlag = Chain.getValue(1);
  }


  // For tail calls lower the arguments to the 'real' stack slot.
  if (IsTailCall) {
    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    // Do not flag preceeding copytoreg stuff together with the following stuff.
    InFlag = SDValue();
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      if (!VA.isRegLoc()) {
        assert(VA.isMemLoc());
        SDValue Arg = TheCall->getArg(i);
        ISD::ArgFlagsTy Flags = TheCall->getArgFlags(i);
        // Create frame index.
        int32_t Offset = VA.getLocMemOffset()+FPDiff;
        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
        FIN = DAG.getFrameIndex(FI, getPointerTy());

        if (Flags.isByVal()) {
          // Copy relative to framepointer.
          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy());
          Source = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
                                                           Flags, DAG));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(Chain, Arg, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0));
        }            
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
                               InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag =SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff);
  }

  // If the callee is a GlobalAddress node (quite common, every direct call is)
  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
                                        getTargetMachine(), true))
      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy(),
                                          G->getOffset());
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
  } else if (IsTailCall) {
    unsigned Opc = Is64Bit ? X86::R9 : X86::EAX;

    Chain = DAG.getCopyToReg(Chain, 
                             DAG.getRegister(Opc, getPointerTy()), 
                             Callee,InFlag);
    Callee = DAG.getRegister(Opc, getPointerTy());
    // Add register as live out.
    DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
  }
 
  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;

  if (IsTailCall) {
    Ops.push_back(Chain);
    Ops.push_back(DAG.getIntPtrConstant(NumBytes, true));
    Ops.push_back(DAG.getIntPtrConstant(0, true));
    if (InFlag.getNode())
      Ops.push_back(InFlag);
    Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
    InFlag = Chain.getValue(1);
 
    // Returns a chain & a flag for retval copy to use.
    NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
    Ops.clear();
  }
  
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));
  
  // Add an implicit use GOT pointer in EBX.
  if (!IsTailCall && !Is64Bit &&
      getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
      Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for x86 vararg functions.
  if (Is64Bit && isVarArg)
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (IsTailCall) {
    assert(InFlag.getNode() && 
           "Flag must be set. Depend on flag being set in LowerRET");
    Chain = DAG.getNode(X86ISD::TAILCALL,
                        TheCall->getVTList(), &Ops[0], Ops.size());
      
    return SDValue(Chain.getNode(), Op.getResNo());
  }

  Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (IsCalleePop(isVarArg, CC))
    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
  else if (!Is64Bit && CC != CallingConv::Fast && IsStructRet)
    // If this is is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
  
  // Returns a flag for retval copy to use.
  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                   true),
                             InFlag);
  InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return SDValue(LowerCallResult(Chain, InFlag, TheCall, CC, DAG),
                 Op.getResNo());
}


//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like std call, callee cleans arguments, convention except that ECX is
//  reserved for storing the tail called function address. Only 2 registers are
//  free for argument passing (inreg). Tail call optimization is performed
//  provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to platform abi the function
//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
//  If a tail called function callee has more arguments than the caller the
//  caller needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after the
//  original REtADDR, but before the saved framepointer or the spilled registers
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR 
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
/// for a 16 byte align requirement.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, 
                                                        SelectionDAG& DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1; 
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment + 
      (StackAlignment-SlotSize);
  }
  return Offset;
}

/// IsEligibleForTailCallElimination - Check to see whether the next instruction
/// following the call is a return. A function is eligible if caller/callee
/// calling conventions match, currently only fastcc supports tail calls, and
/// the function CALL is immediatly followed by a RET.
bool X86TargetLowering::IsEligibleForTailCallOptimization(CallSDNode *TheCall,
                                                      SDValue Ret,
                                                      SelectionDAG& DAG) const {
  if (!PerformTailCallOpt)
    return false;

  if (CheckTailCallReturnConstraints(TheCall, Ret)) {
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned CallerCC = MF.getFunction()->getCallingConv();
    unsigned CalleeCC= TheCall->getCallingConv();
    if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
      SDValue Callee = TheCall->getCallee();
      // On x86/32Bit PIC/GOT  tail calls are supported.
      if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
          !Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit())
        return true;

      // Can only do local tail calls (in same module, hidden or protected) on
      // x86_64 PIC/GOT at the moment.
      if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
        return G->getGlobal()->hasHiddenVisibility()
            || G->getGlobal()->hasProtectedVisibility();
    }
  }

  return false;
}

FastISel *
X86TargetLowering::createFastISel(MachineFunction &mf,
                                  MachineModuleInfo *mmo,
                                  DenseMap<const Value *, unsigned> &vm,
                                  DenseMap<const BasicBlock *,
                                           MachineBasicBlock *> &bm,
                                  DenseMap<const AllocaInst *, int> &am
#ifndef NDEBUG
                                  , SmallSet<Instruction*, 8> &cil
#endif
                                  ) {
  return X86::createFastISel(mf, mmo, vm, bm, am
#ifndef NDEBUG
                             , cil
#endif
                             );
}


//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//


SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();
  uint64_t SlotSize = TD->getPointerSize();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


/// translateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code. It returns a false if it cannot do a direct
/// translation. X86CC is the translated CondCode.  LHS/RHS are modified as
/// needed.
static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                           unsigned &X86CC, SDValue &LHS, SDValue &RHS,
                           SelectionDAG &DAG) {
  X86CC = X86::COND_INVALID;
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        X86CC = X86::COND_NS;
        return true;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        X86CC = X86::COND_S;
        return true;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        X86CC = X86::COND_LE;
        return true;
      }
    }

    switch (SetCCOpcode) {
    default: break;
    case ISD::SETEQ:  X86CC = X86::COND_E;  break;
    case ISD::SETGT:  X86CC = X86::COND_G;  break;
    case ISD::SETGE:  X86CC = X86::COND_GE; break;
    case ISD::SETLT:  X86CC = X86::COND_L;  break;
    case ISD::SETLE:  X86CC = X86::COND_LE; break;
    case ISD::SETNE:  X86CC = X86::COND_NE; break;
    case ISD::SETULT: X86CC = X86::COND_B;  break;
    case ISD::SETUGT: X86CC = X86::COND_A;  break;
    case ISD::SETULE: X86CC = X86::COND_BE; break;
    case ISD::SETUGE: X86CC = X86::COND_AE; break;
    }
  } else {
    // First determine if it is required or is profitable to flip the operands.

    // If LHS is a foldable load, but RHS is not, flip the condition.
    if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
        !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
      SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
      std::swap(LHS, RHS);
    }

    switch (SetCCOpcode) {
    default: break;
    case ISD::SETOLT:
    case ISD::SETOLE:
    case ISD::SETUGT:
    case ISD::SETUGE:
      std::swap(LHS, RHS);
      break;
    }

    // On a floating point condition, the flags are set as follows:
    // ZF  PF  CF   op
    //  0 | 0 | 0 | X > Y
    //  0 | 0 | 1 | X < Y