/// CallIsStructReturn - Determines whether a CALL node uses struct return
/// semantics.
static bool CallIsStructReturn(SDOperand Op) {
  unsigned NumOps = (Op.getNumOperands() - 5) / 2;
  if (!NumOps)
    return false;
  ConstantSDNode *Flags = cast<ConstantSDNode>(Op.getOperand(6));
  return Flags->getValue() & ISD::ParamFlags::StructReturn;
}
/// ArgsAreStructReturn - Determines whether a FORMAL_ARGUMENTS node uses struct
/// return semantics.
static bool ArgsAreStructReturn(SDOperand Op) {
unsigned NumArgs = Op.Val->getNumValues() - 1;
if (!NumArgs)
return false;
ConstantSDNode *Flags = cast<ConstantSDNode>(Op.getOperand(3));
return Flags->getValue() & ISD::ParamFlags::StructReturn;
}
/// IsCalleePop - Determines whether a CALL or FORMAL_ARGUMENTS node requires the
/// callee to pop its own arguments. Callee pop is necessary to support tail
/// calls.
bool X86TargetLowering::IsCalleePop(SDOperand Op) {
bool IsVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
if (IsVarArg)
return false;
switch (cast<ConstantSDNode>(Op.getOperand(1))->getValue()) {
default:
return false;
case CallingConv::X86_StdCall:
return !Subtarget->is64Bit();
case CallingConv::X86_FastCall:
return !Subtarget->is64Bit();
case CallingConv::Fast:
return PerformTailCallOpt;
}
}
/// CCAssignFnForNode - Selects the correct CCAssignFn for a CALL or
/// FORMAL_ARGUMENTS node.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(SDOperand Op) const {
unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
if (Subtarget->is64Bit()) {
if (CC == CallingConv::Fast && PerformTailCallOpt)
return CC_X86_64_TailCall;
    else
      return CC_X86_64_C;
  }

  if (CC == CallingConv::X86_FastCall)
return CC_X86_32_FastCall;
else if (CC == CallingConv::Fast && PerformTailCallOpt)
return CC_X86_32_TailCall;
else
return CC_X86_32_C;
}
/// NameDecorationForFORMAL_ARGUMENTS - Selects the appropriate decoration to
/// apply to a MachineFunction containing a given FORMAL_ARGUMENTS node.
NameDecorationStyle
X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDOperand Op) {
unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
if (CC == CallingConv::X86_FastCall)
return FastCall;
else if (CC == CallingConv::X86_StdCall)
return StdCall;
return None;
}
/// IsPossiblyOverwrittenArgumentOfTailCall - Check if the operand could
/// possibly be overwritten when lowering the outgoing arguments in a tail
/// call. Currently the implementation of this call is very conservative and
/// assumes all arguments sourcing from FORMAL_ARGUMENTS or a CopyFromReg with
/// virtual registers would be overwritten by direct lowering.
static bool IsPossiblyOverwrittenArgumentOfTailCall(SDOperand Op,
MachineFrameInfo * MFI) {
RegisterSDNode * OpReg = NULL;
FrameIndexSDNode * FrameIdxNode = NULL;
int FrameIdx = 0;
if (Op.getOpcode() == ISD::FORMAL_ARGUMENTS ||
(Op.getOpcode()== ISD::CopyFromReg &&
(OpReg = dyn_cast<RegisterSDNode>(Op.getOperand(1))) &&
(OpReg->getReg() >= TargetRegisterInfo::FirstVirtualRegister)) ||
(Op.getOpcode() == ISD::LOAD &&
(FrameIdxNode = dyn_cast<FrameIndexSDNode>(Op.getOperand(1))) &&
(MFI->isFixedObjectIndex((FrameIdx = FrameIdxNode->getIndex()))) &&
(MFI->getObjectOffset(FrameIdx) >= 0)))
return true;
return false;
}
/// CallRequiresGOTPtrInReg - Check whether the call requires the GOT pointer
/// in a register before calling.
bool X86TargetLowering::CallRequiresGOTPtrInReg(bool Is64Bit, bool IsTailCall) {
return !IsTailCall && !Is64Bit &&
getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT();
}
/// CallRequiresFnAddressInReg - Check whether the call requires the function
/// address to be loaded in a register.
bool
X86TargetLowering::CallRequiresFnAddressInReg(bool Is64Bit, bool IsTailCall) {
return !Is64Bit && IsTailCall &&
getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT();
}
/// CopyTailCallClobberedArgumentsToVRegs - Create virtual registers for all
/// arguments to force loading and to guarantee that arguments sourcing from
/// incoming parameters do not overwrite each other.
static SDOperand
CopyTailCallClobberedArgumentsToVRegs(SDOperand Chain,
SmallVector<std::pair<unsigned, SDOperand>, 8> &TailCallClobberedVRegs,
SelectionDAG &DAG,
MachineFunction &MF,
const TargetLowering * TL) {
SDOperand InFlag;
for (unsigned i = 0, e = TailCallClobberedVRegs.size(); i != e; i++) {
SDOperand Arg = TailCallClobberedVRegs[i].second;
unsigned Idx = TailCallClobberedVRegs[i].first;
unsigned VReg =
MF.getRegInfo().
createVirtualRegister(TL->getRegClassFor(Arg.getValueType()));
Chain = DAG.getCopyToReg(Chain, VReg, Arg, InFlag);
InFlag = Chain.getValue(1);
Arg = DAG.getCopyFromReg(Chain, VReg, Arg.getValueType(), InFlag);
TailCallClobberedVRegs[i] = std::make_pair(Idx, Arg);
Chain = Arg.getValue(1);
InFlag = Arg.getValue(2);
}
return Chain;
}
/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
/// by "Src" to address "Dst" with size and alignment information specified by
/// the specific parameter attribute. The copy will be passed as a byval function
/// parameter.
static SDOperand
CreateCopyOfByValArgument(SDOperand Src, SDOperand Dst, SDOperand Chain,
unsigned Flags, SelectionDAG &DAG) {
unsigned Align = 1 <<
((Flags & ISD::ParamFlags::ByValAlign) >> ISD::ParamFlags::ByValAlignOffs);
unsigned Size = (Flags & ISD::ParamFlags::ByValSize) >>
ISD::ParamFlags::ByValSizeOffs;
SDOperand AlignNode = DAG.getConstant(Align, MVT::i32);
SDOperand SizeNode = DAG.getConstant(Size, MVT::i32);
SDOperand AlwaysInline = DAG.getConstant(1, MVT::i32);
return DAG.getMemcpy(Chain, Dst, Src, SizeNode, AlignNode, AlwaysInline);
}
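// Worked example (illustrative only, not tied to the exact bit layout in
// ISD::ParamFlags): if the ByValAlign bit-field of Flags decodes to 4 and the
// ByValSize bit-field decodes to 24, the code above produces
// Align = 1 << 4 = 16 and Size = 24, i.e. a 24-byte, 16-byte-aligned memcpy
// that is always inlined.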
SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo *MFI,
unsigned CC,
SDOperand Root, unsigned i) {
// Create the nodes corresponding to a load from this parameter slot.
unsigned Flags = cast<ConstantSDNode>(Op.getOperand(3 + i))->getValue();
bool AlwaysUseMutable = (CC==CallingConv::Fast) && PerformTailCallOpt;
bool isByVal = Flags & ISD::ParamFlags::ByVal;
bool isImmutable = !AlwaysUseMutable && !isByVal;
// FIXME: For now, all byval parameter objects are marked mutable. This can be
// changed with more analysis.
  // In case of tail call optimization, mark all arguments mutable, since they
  // could be overwritten by the lowering of the arguments of a tail call.
int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
VA.getLocMemOffset(), isImmutable);
SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
  return DAG.getLoad(VA.getValVT(), Root, FIN,
                     PseudoSourceValue::getFixedStack(), FI);
}
SDOperand
X86TargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Function* Fn = MF.getFunction();
if (Fn->hasExternalLinkage() &&
Subtarget->isTargetCygMing() &&
Fn->getName() == "main")
FuncInfo->setForceFramePointer(true);
// Decorate the function name.
FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));
MachineFrameInfo *MFI = MF.getFrameInfo();
SDOperand Root = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
unsigned CC = MF.getFunction()->getCallingConv();
bool Is64Bit = Subtarget->is64Bit();
assert(!(isVarArg && CC == CallingConv::Fast) &&
"Var args not supported with calling convention fastcc");
  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
CCInfo.AnalyzeFormalArguments(Op.Val, CCAssignFnForNode(Op));
SmallVector<SDOperand, 8> ArgValues;
unsigned LastVal = ~0U;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
// TODO: If an arg is passed in two places (e.g. reg and stack), skip later
// places.
assert(VA.getValNo() != LastVal &&
"Don't support value assigned to multiple locs yet");
LastVal = VA.getValNo();
if (VA.isRegLoc()) {
MVT::ValueType RegVT = VA.getLocVT();
TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = X86::GR32RegisterClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = X86::GR64RegisterClass;
      else if (RegVT == MVT::f32)
        RC = X86::FR32RegisterClass;
      else if (RegVT == MVT::f64)
        RC = X86::FR64RegisterClass;
else {
assert(MVT::isVector(RegVT));
if (Is64Bit && MVT::getSizeInBits(RegVT) == 64) {
RC = X86::GR64RegisterClass; // MMX values are passed in GPRs.
RegVT = MVT::i64;
        } else
          RC = X86::VR128RegisterClass;
      }

      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
if (VA.getLocInfo() != CCValAssign::Full)
ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
// Handle MMX values passed in GPRs.
if (Is64Bit && RegVT != VA.getLocVT() && RC == X86::GR64RegisterClass &&
MVT::getSizeInBits(RegVT) == 64)
ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
ArgValues.push_back(ArgValue);
} else {
assert(VA.isMemLoc());
      ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, CC, Root, i));
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // align stack specially for tail calls
  if (CC == CallingConv::Fast)
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
if (Is64Bit || CC != CallingConv::X86_FastCall) {
VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
}
if (Is64Bit) {
static const unsigned GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
static const unsigned XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so they
      // may be loaded by dereferencing the result of va_next.
VarArgsGPOffset = NumIntRegs * 8;
VarArgsFPOffset = 6 * 8 + NumXMMRegs * 16;
RegSaveFrameIndex = MFI->CreateStackObject(6 * 8 + 8 * 16, 16);
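      // Example of the register-save-area bookkeeping above (illustrative,
      // assuming the standard AMD64 order of 6 GPRs followed by 8 XMM regs):
      // if two integer arguments and one XMM argument were already consumed
      // by named parameters, then VarArgsGPOffset = 2 * 8 = 16 and
      // VarArgsFPOffset = 48 + 1 * 16 = 64, and the frame object created
      // above is 6 * 8 + 8 * 16 = 176 bytes.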
// Store the integer parameter registers.
SmallVector<SDOperand, 8> MemOps;
SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
SDOperand FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
DAG.getIntPtrConstant(VarArgsGPOffset));
for (; NumIntRegs != 6; ++NumIntRegs) {
unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
X86::GR64RegisterClass);
SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
SDOperand Store =
DAG.getStore(Val.getValue(1), Val, FIN,
PseudoSourceValue::getFixedStack(),
RegSaveFrameIndex);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
DAG.getIntPtrConstant(8));
}
// Now store the XMM (fp + vector) parameter registers.
FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
DAG.getIntPtrConstant(VarArgsFPOffset));
for (; NumXMMRegs != 8; ++NumXMMRegs) {
unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
X86::VR128RegisterClass);
SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
SDOperand Store =
DAG.getStore(Val.getValue(1), Val, FIN,
PseudoSourceValue::getFixedStack(),
RegSaveFrameIndex);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
DAG.getIntPtrConstant(16));
}
if (!MemOps.empty())
        Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
                           &MemOps[0], MemOps.size());
    }
  }

// Make sure the instruction takes 8n+4 bytes to make sure the start of the
// arguments and the arguments after the retaddr has been pushed are
// aligned.
if (!Is64Bit && CC == CallingConv::X86_FastCall &&
!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows() &&
(StackSize & 7) == 0)
StackSize += 4;
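  // Worked example of the adjustment above: if StackSize is 16 (a multiple of
  // 8), it becomes 20 = 8*2 + 4, so that once the 4-byte return address has
  // been pushed the argument area starts at an 8-byte aligned offset.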
ArgValues.push_back(Root);
// Some CCs need callee pop.
if (IsCalleePop(Op)) {
BytesToPopOnReturn = StackSize; // Callee pops everything.
BytesCallerReserves = 0;
} else {
BytesToPopOnReturn = 0; // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && ArgsAreStructReturn(Op))
BytesToPopOnReturn = 4;
BytesCallerReserves = StackSize;
}
if (!Is64Bit) {
RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only.
if (CC == CallingConv::X86_FastCall)
VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
}
FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
// Return the new list of results.
return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
&ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
}
SDOperand
X86TargetLowering::LowerMemOpCallTo(SDOperand Op, SelectionDAG &DAG,
const SDOperand &StackPtr,
const CCValAssign &VA,
SDOperand Chain,
SDOperand Arg) {
unsigned LocMemOffset = VA.getLocMemOffset();
SDOperand PtrOff = DAG.getIntPtrConstant(LocMemOffset);
PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
unsigned Flags = cast<ConstantSDNode>(FlagsOp)->getValue();
if (Flags & ISD::ParamFlags::ByVal) {
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG);
}
return DAG.getStore(Chain, Arg, PtrOff,
PseudoSourceValue::getStack(), LocMemOffset);
}
/// ClassifyX86_64SRetCallReturn - Classify how to implement an x86-64
/// struct return call to the specified function. The x86-64 ABI specifies
/// that some SRet calls are actually returned in registers. Since current
/// LLVM cannot represent multi-value calls, they are represented as calls
/// where the results are passed in a hidden struct provided by the caller.
/// This function examines the type of the struct to determine the correct
/// way to implement the call.
X86::X86_64SRet
X86TargetLowering::ClassifyX86_64SRetCallReturn(const Function *Fn) {
// FIXME: Disabled for now.
return X86::InMemory;
const PointerType *PTy = cast<PointerType>(Fn->arg_begin()->getType());
const Type *RTy = PTy->getElementType();
unsigned Size = getTargetData()->getABITypeSize(RTy);
if (Size != 16 && Size != 32)
return X86::InMemory;
if (Size == 32) {
const StructType *STy = dyn_cast<StructType>(RTy);
if (!STy) return X86::InMemory;
if (STy->getNumElements() == 2 &&
STy->getElementType(0) == Type::X86_FP80Ty &&
STy->getElementType(1) == Type::X86_FP80Ty)
return X86::InX87;
}
bool AllFP = true;
for (Type::subtype_iterator I = RTy->subtype_begin(), E = RTy->subtype_end();
I != E; ++I) {
const Type *STy = I->get();
if (!STy->isFPOrFPVector()) {
AllFP = false;
break;
}
}
if (AllFP)
return X86::InSSE;
return X86::InGPR64;
}
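// Illustrative classifications under the (currently disabled) logic above:
//   { x86_fp80, x86_fp80 }   (32 bytes)          -> X86::InX87
//   { double, double }       (16 bytes, all FP)   -> X86::InSSE
//   { i64, i64 }             (16 bytes, not FP)   -> X86::InGPR64
//   anything that is neither 16 nor 32 bytes      -> X86::InMemory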
void X86TargetLowering::X86_64AnalyzeSRetCallOperands(SDNode *TheCall,
CCAssignFn *Fn,
CCState &CCInfo) {
unsigned NumOps = (TheCall->getNumOperands() - 5) / 2;
for (unsigned i = 1; i != NumOps; ++i) {
MVT::ValueType ArgVT = TheCall->getOperand(5+2*i).getValueType();
SDOperand FlagOp = TheCall->getOperand(5+2*i+1);
unsigned ArgFlags =cast<ConstantSDNode>(FlagOp)->getValue();
if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo)) {
cerr << "Call operand #" << i << " has unhandled type "
<< MVT::getValueTypeString(ArgVT) << "\n";
abort();
}
}
}
SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo * MFI = MF.getFrameInfo();
SDOperand Chain = Op.getOperand(0);
unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
bool IsTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0
&& CC == CallingConv::Fast && PerformTailCallOpt;
SDOperand Callee = Op.getOperand(4);
bool Is64Bit = Subtarget->is64Bit();
bool IsStructRet = CallIsStructReturn(Op);
assert(!(isVarArg && CC == CallingConv::Fast) &&
"Var args not supported with calling convention fastcc");
  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
CCAssignFn *CCFn = CCAssignFnForNode(Op);
X86::X86_64SRet SRetMethod = X86::InMemory;
if (Is64Bit && IsStructRet)
// FIXME: We can't figure out type of the sret structure for indirect
// calls. We need to copy more information from CallSite to the ISD::CALL
// node.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
SRetMethod =
ClassifyX86_64SRetCallReturn(dyn_cast<Function>(G->getGlobal()));
  // UGLY HACK! For x86-64, some 128-bit aggregates are returned in a pair of
  // registers. Unfortunately, LLVM does not support i128 yet, so we pretend
  // it's an sret call.
if (SRetMethod != X86::InMemory)
X86_64AnalyzeSRetCallOperands(Op.Val, CCFn, CCInfo);
else
CCInfo.AnalyzeCallOperands(Op.Val, CCFn);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (CC == CallingConv::Fast)
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
// Make sure the instruction takes 8n+4 bytes to make sure the start of the
// arguments and the arguments after the retaddr has been pushed are aligned.
if (!Is64Bit && CC == CallingConv::X86_FastCall &&
!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows() &&
(NumBytes & 7) == 0)
NumBytes += 4;
int FPDiff = 0;
if (IsTailCall) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the returnaddr stackslot.
// But only set if delta is greater than previous delta.
if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
}
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes));
SDOperand RetAddrFrIdx;
if (IsTailCall) {
// Adjust the Return address stack slot.
if (FPDiff) {
MVT::ValueType VT = Is64Bit ? MVT::i64 : MVT::i32;
RetAddrFrIdx = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
      RetAddrFrIdx = DAG.getLoad(VT, Chain, RetAddrFrIdx, NULL, 0);
Chain = SDOperand(RetAddrFrIdx.Val, 1);
}
}
SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
SmallVector<std::pair<unsigned, SDOperand>, 8> TailCallClobberedVRegs;
  SDOperand StackPtr;
  SmallVector<SDOperand, 8> MemOpChains;
// Walk the register/memloc assignments, inserting copies/loads. For tail
// calls, remember all arguments for later special lowering.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: assert(0 && "Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
break;
}
if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
if (!IsTailCall) {
assert(VA.isMemLoc());
if (StackPtr.Val == 0)
StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy());
MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
Arg));
} else if (IsPossiblyOverwrittenArgumentOfTailCall(Arg, MFI)) {
TailCallClobberedVRegs.push_back(std::make_pair(i,Arg));
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
&MemOpChains[0], MemOpChains.size());
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDOperand InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
InFlag);
InFlag = Chain.getValue(1);
}
  // ELF / PIC requires the GOT pointer to be in the EBX register before
  // function calls via the PLT.
if (CallRequiresGOTPtrInReg(Is64Bit, IsTailCall)) {
Chain = DAG.getCopyToReg(Chain, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
InFlag);
InFlag = Chain.getValue(1);
}
// If we are tail calling and generating PIC/GOT style code load the address
// of the callee into ecx. The value in ecx is used as target of the tail
// jump. This is done to circumvent the ebx/callee-saved problem for tail
// calls on PIC/GOT architectures. Normally we would just put the address of
  // GOT into ebx and then call target@PLT. But for tail calls ebx would be
// restored (since ebx is callee saved) before jumping to the target@PLT.
if (CallRequiresFnAddressInReg(Is64Bit, IsTailCall)) {
// Note: The actual moving to ecx is done further down.
GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
if (G && !G->getGlobal()->hasHiddenVisibility() &&
!G->getGlobal()->hasProtectedVisibility())
Callee = LowerGlobalAddress(Callee, DAG);
else if (isa<ExternalSymbolSDNode>(Callee))
Callee = LowerExternalSymbol(Callee,DAG);
}
if (Is64Bit && isVarArg) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const unsigned XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
Chain = DAG.getCopyToReg(Chain, X86::AL,
DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
InFlag = Chain.getValue(1);
}
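  // For example (illustrative): a varargs call that passes three doubles in
  // XMM0-XMM2 has NumXMMRegs == 3, so the copy above becomes, in effect, a
  // move of the constant 3 into %al before the call, satisfying the
  // upper-bound requirement described in the ABI quote above.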
// For tail calls lower the arguments to the 'real' stack slot.
if (IsTailCall) {
SmallVector<SDOperand, 8> MemOpChains2;
SDOperand FIN;
int FI = 0;
    // Do not flag preceding copytoreg stuff together with the following stuff.
InFlag = SDOperand();
Chain = CopyTailCallClobberedArgumentsToVRegs(Chain, TailCallClobberedVRegs,
DAG, MF, this);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc()) {
assert(VA.isMemLoc());
SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
unsigned Flags = cast<ConstantSDNode>(FlagsOp)->getValue();
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
FIN = DAG.getFrameIndex(FI, MVT::i32);
// Find virtual register for this argument.
bool Found=false;
for (unsigned idx=0, e= TailCallClobberedVRegs.size(); idx < e; idx++)
if (TailCallClobberedVRegs[idx].first==i) {
Arg = TailCallClobberedVRegs[idx].second;
Found=true;
break;
}
assert(IsPossiblyOverwrittenArgumentOfTailCall(Arg, MFI)==false ||
(Found==true && "No corresponding Argument was found"));
if (Flags & ISD::ParamFlags::ByVal) {
          MemOpChains2.push_back(CreateCopyOfByValArgument(Arg, FIN, Chain,
                                                           Flags, DAG));
} else {
MemOpChains2.push_back(
DAG.getStore(Chain, Arg, FIN,
PseudoSourceValue::getFixedStack(), FI));
}
}
}
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
&MemOpChains2[0], MemOpChains2.size());
// Store the return address to the appropriate stack slot.
if (FPDiff) {
// Calculate the new stack slot for the return address.
int SlotSize = Is64Bit ? 8 : 4;
int NewReturnAddrFI =
MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
MVT::ValueType VT = Is64Bit ? MVT::i64 : MVT::i32;
SDOperand NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
Chain = DAG.getStore(Chain, RetAddrFrIdx, NewRetAddrFrIdx,
PseudoSourceValue::getFixedStack(), NewReturnAddrFI);
}
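    // Worked example (32-bit, illustrative): if the caller pushed 16 bytes of
    // arguments but this tail call needs 32 bytes, the FPDiff computed
    // earlier is 16 - 32 = -16, so the return address is re-stored in a
    // 4-byte fixed object created at offset FPDiff - SlotSize = -20.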
}
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
// We should use extra load for direct calls to dllimported functions in
// non-JIT mode.
if ((IsTailCall || !Is64Bit ||
getTargetMachine().getCodeModel() != CodeModel::Large)
&& !Subtarget->GVRequiresExtraLoad(G->getGlobal(),
getTargetMachine(), true))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
if (IsTailCall || !Is64Bit ||
getTargetMachine().getCodeModel() != CodeModel::Large)
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
} else if (IsTailCall) {
unsigned Opc = Is64Bit ? X86::R9 : X86::ECX;
Chain = DAG.getCopyToReg(Chain,
DAG.getRegister(Opc, getPointerTy()),
Callee,InFlag);
Callee = DAG.getRegister(Opc, getPointerTy());
// Add register as live out.
    DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
  }
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
SmallVector<SDOperand, 8> Ops;
if (IsTailCall) {
Ops.push_back(Chain);
Ops.push_back(DAG.getIntPtrConstant(NumBytes));
Ops.push_back(DAG.getIntPtrConstant(0));
if (InFlag.Val)
Ops.push_back(InFlag);
Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
// Returns a chain & a flag for retval copy to use.
NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
Ops.clear();
}
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall)
Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
// Add an implicit use GOT pointer in EBX.
if (!IsTailCall && !Is64Bit &&
getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT())
Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
if (InFlag.Val)
Ops.push_back(InFlag);
if (IsTailCall) {
assert(InFlag.Val &&
"Flag must be set. Depend on flag being set in LowerRET");
Chain = DAG.getNode(X86ISD::TAILCALL,
Op.Val->getVTList(), &Ops[0], Ops.size());
return SDOperand(Chain.Val, Op.ResNo);
}
Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPush;
if (IsCalleePop(Op))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
else if (!Is64Bit && IsStructRet)
    // If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
NumBytesForCalleeToPush = 4;
else
NumBytesForCalleeToPush = 0; // Callee pops nothing.
// Returns a flag for retval copy to use.
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytes),
                             DAG.getIntPtrConstant(NumBytesForCalleeToPush),
                             InFlag);
  InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
switch (SRetMethod) {
default:
return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
case X86::InGPR64:
return SDOperand(LowerCallResultToTwo64BitRegs(Chain, InFlag, Op.Val,
X86::RAX, X86::RDX,
MVT::i64, DAG), Op.ResNo);
case X86::InSSE:
return SDOperand(LowerCallResultToTwo64BitRegs(Chain, InFlag, Op.Val,
X86::XMM0, X86::XMM1,
MVT::f64, DAG), Op.ResNo);
case X86::InX87:
return SDOperand(LowerCallResultToTwoX87Regs(Chain, InFlag, Op.Val, DAG),
Op.ResNo);
}
}
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
//  Like stdcall, the callee cleans up the arguments, except that ECX is
//  reserved for storing the tail called function address. Only 2 registers are
// free for argument passing (inreg). Tail call optimization is performed
// provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// On X86_64 architecture with GOT-style position independent code only local
// (within module) calls are supported at the moment.
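// For illustration: a fastcc caller whose body is just
//   "%r = tail call fastcc i32 @callee(...); ret i32 %r"
// satisfies these conditions when the option controlling PerformTailCallOpt is
// enabled, so LowerCALL above emits X86ISD::TAILCALL instead of the normal
// call/CALLSEQ_END sequence.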
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
// for example.) If a tail called callee has more arguments than the caller,
// the caller needs to make sure that there is room to move the RETADDR to.
// This is achieved by reserving an area the size of the argument delta right
// after the original RETADDR, but before the saved frame pointer or the
// spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
// stack layout:
// arg1
// arg2
// RETADDR
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// GetAlignedArgumentStackSize - Round up the stack size so that it is, e.g.,
/// of the form 16n + 12 for a 16-byte alignment requirement.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) {
if (PerformTailCallOpt) {
MachineFunction &MF = DAG.getMachineFunction();
const TargetMachine &TM = MF.getTarget();
const TargetFrameInfo &TFI = *TM.getFrameInfo();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
unsigned SlotSize = Subtarget->is64Bit() ? 8 : 4;
if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
// Number smaller than 12 so just add the difference.
Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
} else {
// Mask out lower bits, add stackalignment once plus the 12 bytes.
Offset = ((~AlignMask) & Offset) + StackAlignment +
(StackAlignment-SlotSize);
}
StackSize = Offset;
}
return StackSize;
}
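// Worked example of the rounding above (illustrative), with a 16-byte stack
// alignment and a 4-byte slot size: StackSize = 20 has (20 & 15) = 4 <= 12,
// so it is bumped by 12 - 4 = 8 to 28 (= 16*1 + 12); StackSize = 30 has
// (30 & 15) = 14 > 12, so it becomes (30 & ~15) + 16 + 12 = 44 (= 16*2 + 12).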
/// IsEligibleForTailCallOptimization - Check to see whether the next
/// instruction following the call is a return. A function is eligible if
/// caller/callee calling conventions match, currently only fastcc supports
/// tail calls, and the function CALL is immediately followed by a RET.
bool X86TargetLowering::IsEligibleForTailCallOptimization(SDOperand Call,
SDOperand Ret,
SelectionDAG& DAG) const {
if (!PerformTailCallOpt)
return false;
  // Check whether the CALL node immediately precedes the RET node and whether the
// return uses the result of the node or is a void return.
unsigned NumOps = Ret.getNumOperands();
if ((NumOps == 1 &&
(Ret.getOperand(0) == SDOperand(Call.Val,1) ||
Ret.getOperand(0) == SDOperand(Call.Val,0))) ||
(NumOps > 1 &&
Ret.getOperand(0) == SDOperand(Call.Val,Call.Val->getNumValues()-1) &&
Ret.getOperand(1) == SDOperand(Call.Val,0))) {
MachineFunction &MF = DAG.getMachineFunction();
unsigned CallerCC = MF.getFunction()->getCallingConv();
unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
SDOperand Callee = Call.getOperand(4);
// On x86/32Bit PIC/GOT tail calls are supported.
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
!Subtarget->isPICStyleGOT()|| !Subtarget->is64Bit())
return true;
// Can only do local tail calls (in same module, hidden or protected) on
// x86_64 PIC/GOT at the moment.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
return G->getGlobal()->hasHiddenVisibility()
|| G->getGlobal()->hasProtectedVisibility();
}
}
return false;
}
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
SDOperand X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
if (Subtarget->is64Bit())
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(8, -8);
else
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, -4);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}
/// translateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code. It returns false if it cannot do a direct
/// translation. X86CC is the translated CondCode. LHS/RHS are modified as
/// needed.
static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
unsigned &X86CC, SDOperand &LHS, SDOperand &RHS,
SelectionDAG &DAG) {
  X86CC = X86::COND_INVALID;
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, RHS.getValueType());
X86CC = X86::COND_NS;
return true;
} else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
        X86CC = X86::COND_S;
        return true;
} else if (SetCCOpcode == ISD::SETLT && RHSC->getValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, RHS.getValueType());
X86CC = X86::COND_LE;
        return true;
      }
    }

    switch (SetCCOpcode) {
    default: break;
    case ISD::SETEQ:  X86CC = X86::COND_E;  break;
case ISD::SETGT: X86CC = X86::COND_G; break;
case ISD::SETGE: X86CC = X86::COND_GE; break;
case ISD::SETLT: X86CC = X86::COND_L; break;
case ISD::SETLE: X86CC = X86::COND_LE; break;
case ISD::SETNE: X86CC = X86::COND_NE; break;
case ISD::SETULT: X86CC = X86::COND_B; break;
case ISD::SETUGT: X86CC = X86::COND_A; break;
case ISD::SETULE: X86CC = X86::COND_BE; break;
case ISD::SETUGE: X86CC = X86::COND_AE; break;
}
} else {
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
    bool Flip = false;
    switch (SetCCOpcode) {
default: break;
case ISD::SETUEQ:
case ISD::SETEQ: X86CC = X86::COND_E; break;
case ISD::SETOLT: Flip = true; // Fallthrough
case ISD::SETGT: X86CC = X86::COND_A; break;
case ISD::SETOLE: Flip = true; // Fallthrough
case ISD::SETGE: X86CC = X86::COND_AE; break;
case ISD::SETUGT: Flip = true; // Fallthrough
case ISD::SETLT: X86CC = X86::COND_B; break;
case ISD::SETUGE: Flip = true; // Fallthrough
case ISD::SETLE: X86CC = X86::COND_BE; break;
case ISD::SETNE: X86CC = X86::COND_NE; break;
case ISD::SETUO: X86CC = X86::COND_P; break;
    case ISD::SETO:  X86CC = X86::COND_NP; break;
    }
    if (Flip)
      std::swap(LHS, RHS);
  }

  return X86CC != X86::COND_INVALID;
}
/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code. Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.