return !Subtarget->is64Bit();
case CallingConv::X86_FastCall:
return !Subtarget->is64Bit();
case CallingConv::Fast:
return PerformTailCallOpt;
}
}
// Selects the correct CCAssignFn for a CALL or FORMAL_ARGUMENTS node.
CCAssignFn *X86TargetLowering::CCAssignFnForNode(SDOperand Op) const {
unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
if (Subtarget->is64Bit())
if (CC == CallingConv::Fast && PerformTailCallOpt)
return CC_X86_64_TailCall;
else
return CC_X86_64_C;
if (CC == CallingConv::X86_FastCall)
return CC_X86_32_FastCall;
else if (CC == CallingConv::Fast && PerformTailCallOpt)
return CC_X86_32_TailCall;
else
return CC_X86_32_C;
}
// Selects the appropriate decoration to apply to a MachineFunction containing a
// given FORMAL_ARGUMENTS node.
NameDecorationStyle
X86TargetLowering::NameDecorationForFORMAL_ARGUMENTS(SDOperand Op) {
unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
if (CC == CallingConv::X86_FastCall)
return FastCall;
else if (CC == CallingConv::X86_StdCall)
return StdCall;
return None;
}
// IsPossiblyOverwrittenArgumentOfTailCall - Check if the operand could possibly
// be overwritten when lowering the outgoing arguments in a tail call. Currently
// the implementation of this call is very conservative and assumes all
// arguments sourcing from FORMAL_ARGUMENTS or a CopyFromReg with virtual
// registers would be overwritten by direct lowering.
// Possible improvement:
// Check FORMAL_ARGUMENTS corresponding MERGE_VALUES for CopyFromReg nodes
// indicating inreg passed arguments which also need not be lowered to a safe
// stack slot.
static bool IsPossiblyOverwrittenArgumentOfTailCall(SDOperand Op) {
RegisterSDNode * OpReg = NULL;
if (Op.getOpcode() == ISD::FORMAL_ARGUMENTS ||
(Op.getOpcode()== ISD::CopyFromReg &&
(OpReg = cast<RegisterSDNode>(Op.getOperand(1))) &&
OpReg->getReg() >= TargetRegisterInfo::FirstVirtualRegister))
return true;
return false;
}
// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
// by "Src" to address "Dst" with size and alignment information specified by
// the specific parameter attribute. The copy will be passed as a byval function
// parameter.
static SDOperand
CreateCopyOfByValArgument(SDOperand Src, SDOperand Dst, SDOperand Chain,
unsigned Flags, SelectionDAG &DAG) {
unsigned Align = 1 <<
((Flags & ISD::ParamFlags::ByValAlign) >> ISD::ParamFlags::ByValAlignOffs);
unsigned Size = (Flags & ISD::ParamFlags::ByValSize) >>
ISD::ParamFlags::ByValSizeOffs;
SDOperand AlignNode = DAG.getConstant(Align, MVT::i32);
SDOperand SizeNode = DAG.getConstant(Size, MVT::i32);
SDOperand AlwaysInline = DAG.getConstant(1, MVT::i32);
return DAG.getMemcpy(Chain, Dst, Src, SizeNode, AlignNode, AlwaysInline);
}
SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo *MFI,
SDOperand Root, unsigned i) {
// Create the nodes corresponding to a load from this parameter slot.
unsigned Flags = cast<ConstantSDNode>(Op.getOperand(3 + i))->getValue();
bool isByVal = Flags & ISD::ParamFlags::ByVal;
// FIXME: For now, all byval parameter objects are marked mutable. This
// can be changed with more analysis.
int FI = MFI->CreateFixedObject(MVT::getSizeInBits(VA.getValVT())/8,
VA.getLocMemOffset(), !isByVal);
SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
return DAG.getLoad(VA.getValVT(), Root, FIN,
PseudoSourceValue::getFixedStack(), FI);
}
SDOperand
X86TargetLowering::LowerFORMAL_ARGUMENTS(SDOperand Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const Function* Fn = MF.getFunction();
if (Fn->hasExternalLinkage() &&
Subtarget->isTargetCygMing() &&
Fn->getName() == "main")
FuncInfo->setForceFramePointer(true);
// Decorate the function name.
FuncInfo->setDecorationStyle(NameDecorationForFORMAL_ARGUMENTS(Op));
MachineFrameInfo *MFI = MF.getFrameInfo();
SDOperand Root = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
unsigned CC = MF.getFunction()->getCallingConv();
bool Is64Bit = Subtarget->is64Bit();
assert(!(isVarArg && CC == CallingConv::Fast) &&
"Var args not supported with calling convention fastcc");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
CCInfo.AnalyzeFormalArguments(Op.Val, CCAssignFnForNode(Op));
SmallVector<SDOperand, 8> ArgValues;
unsigned LastVal = ~0U;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
// TODO: If an arg is passed in two places (e.g. reg and stack), skip later
// places.
assert(VA.getValNo() != LastVal &&
"Don't support value assigned to multiple locs yet");
LastVal = VA.getValNo();
if (VA.isRegLoc()) {
MVT::ValueType RegVT = VA.getLocVT();
TargetRegisterClass *RC;
if (RegVT == MVT::i32)
RC = X86::GR32RegisterClass;
else if (Is64Bit && RegVT == MVT::i64)
RC = X86::GR64RegisterClass;
else if (RegVT == MVT::f64)
RC = X86::FR64RegisterClass;
else {
assert(MVT::isVector(RegVT));
if (Is64Bit && MVT::getSizeInBits(RegVT) == 64) {
RC = X86::GR64RegisterClass; // MMX values are passed in GPRs.
RegVT = MVT::i64;
} else
RC = X86::VR128RegisterClass;
}
unsigned Reg = AddLiveIn(DAG.getMachineFunction(), VA.getLocReg(), RC);
SDOperand ArgValue = DAG.getCopyFromReg(Root, Reg, RegVT);
// If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
if (VA.getLocInfo() == CCValAssign::SExt)
ArgValue = DAG.getNode(ISD::AssertSext, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
else if (VA.getLocInfo() == CCValAssign::ZExt)
ArgValue = DAG.getNode(ISD::AssertZext, RegVT, ArgValue,
DAG.getValueType(VA.getValVT()));
if (VA.getLocInfo() != CCValAssign::Full)
ArgValue = DAG.getNode(ISD::TRUNCATE, VA.getValVT(), ArgValue);
// Handle MMX values passed in GPRs.
if (Is64Bit && RegVT != VA.getLocVT() && RC == X86::GR64RegisterClass &&
MVT::getSizeInBits(RegVT) == 64)
ArgValue = DAG.getNode(ISD::BIT_CONVERT, VA.getLocVT(), ArgValue);
ArgValues.push_back(ArgValue);
} else {
assert(VA.isMemLoc());
ArgValues.push_back(LowerMemArgument(Op, DAG, VA, MFI, Root, i));
}
}
unsigned StackSize = CCInfo.getNextStackOffset();
// Align the stack specially for tail calls.
if (CC == CallingConv::Fast)
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
if (Is64Bit || CC != CallingConv::X86_FastCall) {
VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
}
if (Is64Bit) {
static const unsigned GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
};
static const unsigned XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
// For X86-64, if there are vararg parameters that are passed via
// registers, then we must store them to their spots on the stack so they
// may be loaded by dereferencing the result of va_next.
VarArgsGPOffset = NumIntRegs * 8;
VarArgsFPOffset = 6 * 8 + NumXMMRegs * 16;
RegSaveFrameIndex = MFI->CreateStackObject(6 * 8 + 8 * 16, 16);
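// Illustrative layout of the 176-byte (6*8 + 8*16) register save area created
// above: the six GP argument registers occupy bytes 0-47 and the eight XMM
// registers occupy bytes 48-175. VarArgsGPOffset and VarArgsFPOffset therefore
// point at the first slot not used by a named argument, which is where va_arg
// starts reading register-passed varargs.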
// Store the integer parameter registers.
SmallVector<SDOperand, 8> MemOps;
SDOperand RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
SDOperand FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
DAG.getIntPtrConstant(VarArgsGPOffset));
for (; NumIntRegs != 6; ++NumIntRegs) {
unsigned VReg = AddLiveIn(MF, GPR64ArgRegs[NumIntRegs],
X86::GR64RegisterClass);
SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::i64);
SDOperand Store =
DAG.getStore(Val.getValue(1), Val, FIN,
PseudoSourceValue::getFixedStack(),
RegSaveFrameIndex);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
DAG.getIntPtrConstant(8));
}
// Now store the XMM (fp + vector) parameter registers.
FIN = DAG.getNode(ISD::ADD, getPointerTy(), RSFIN,
DAG.getIntPtrConstant(VarArgsFPOffset));
for (; NumXMMRegs != 8; ++NumXMMRegs) {
unsigned VReg = AddLiveIn(MF, XMMArgRegs[NumXMMRegs],
X86::VR128RegisterClass);
SDOperand Val = DAG.getCopyFromReg(Root, VReg, MVT::v4f32);
SDOperand Store =
DAG.getStore(Val.getValue(1), Val, FIN,
PseudoSourceValue::getFixedStack(),
RegSaveFrameIndex);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, getPointerTy(), FIN,
DAG.getIntPtrConstant(16));
}
if (!MemOps.empty())
Root = DAG.getNode(ISD::TokenFactor, MVT::Other,
&MemOps[0], MemOps.size());
}
}
// Make sure the argument area takes 8n+4 bytes so that the start of the
// arguments, and the arguments after the return address has been pushed, are
// aligned.
if (!Is64Bit && CC == CallingConv::X86_FastCall &&
!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows() &&
(StackSize & 7) == 0)
StackSize += 4;
ArgValues.push_back(Root);
// Some CCs need callee pop.
if (IsCalleePop(Op)) {
BytesToPopOnReturn = StackSize; // Callee pops everything.
BytesCallerReserves = 0;
} else {
BytesToPopOnReturn = 0; // Callee pops nothing.
// If this is an sret function, the return should pop the hidden pointer.
if (!Is64Bit && ArgsAreStructReturn(Op))
BytesToPopOnReturn = 4;
BytesCallerReserves = StackSize;
}
if (!Is64Bit) {
RegSaveFrameIndex = 0xAAAAAAA; // RegSaveFrameIndex is X86-64 only.
if (CC == CallingConv::X86_FastCall)
VarArgsFrameIndex = 0xAAAAAAA; // fastcc functions can't have varargs.
}
FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
// Return the new list of results.
return DAG.getNode(ISD::MERGE_VALUES, Op.Val->getVTList(),
&ArgValues[0], ArgValues.size()).getValue(Op.ResNo);
}
SDOperand
X86TargetLowering::LowerMemOpCallTo(SDOperand Op, SelectionDAG &DAG,
const SDOperand &StackPtr,
const CCValAssign &VA,
SDOperand Chain,
SDOperand Arg) {
unsigned LocMemOffset = VA.getLocMemOffset();
SDOperand PtrOff = DAG.getIntPtrConstant(LocMemOffset);
PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
unsigned Flags = cast<ConstantSDNode>(FlagsOp)->getValue();
if (Flags & ISD::ParamFlags::ByVal) {
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG);
}
return DAG.getStore(Chain, Arg, PtrOff,
PseudoSourceValue::getStack(), LocMemOffset);
}
/// ClassifyX86_64SRetCallReturn - Classify how to implement an x86-64
/// struct return call to the specified function. The X86-64 ABI specifies
/// that some SRet calls are actually returned in registers. Since current
/// LLVM cannot represent multi-value calls, they are represented as
/// calls where the results are passed in a hidden struct provided by
/// the caller. This function examines the type of the struct to
/// determine the correct way to implement the call.
X86::X86_64SRet
X86TargetLowering::ClassifyX86_64SRetCallReturn(const Function *Fn) {
// FIXME: Disabled for now.
return X86::InMemory;
const PointerType *PTy = cast<PointerType>(Fn->arg_begin()->getType());
const Type *RTy = PTy->getElementType();
unsigned Size = getTargetData()->getABITypeSize(RTy);
if (Size != 16 && Size != 32)
return X86::InMemory;
if (Size == 32) {
const StructType *STy = dyn_cast<StructType>(RTy);
if (!STy) return X86::InMemory;
if (STy->getNumElements() == 2 &&
STy->getElementType(0) == Type::X86_FP80Ty &&
STy->getElementType(1) == Type::X86_FP80Ty)
return X86::InX87;
}
bool AllFP = true;
for (Type::subtype_iterator I = RTy->subtype_begin(), E = RTy->subtype_end();
I != E; ++I) {
const Type *STy = I->get();
if (!STy->isFPOrFPVector()) {
AllFP = false;
break;
}
}
if (AllFP)
return X86::InSSE;
return X86::InGPR64;
}
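// Illustration of the (currently disabled) classification above: a 16-byte
// sret struct of two doubles is all-FP and would be classified InSSE
// (returned in XMM0/XMM1 below); two i64 fields would be InGPR64 (RAX/RDX);
// a 32-byte {x86_fp80, x86_fp80} pair would be InX87; any other size falls
// back to InMemory.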
void X86TargetLowering::X86_64AnalyzeSRetCallOperands(SDNode *TheCall,
CCAssignFn *Fn,
CCState &CCInfo) {
unsigned NumOps = (TheCall->getNumOperands() - 5) / 2;
for (unsigned i = 1; i != NumOps; ++i) {
MVT::ValueType ArgVT = TheCall->getOperand(5+2*i).getValueType();
SDOperand FlagOp = TheCall->getOperand(5+2*i+1);
unsigned ArgFlags = cast<ConstantSDNode>(FlagOp)->getValue();
if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo)) {
cerr << "Call operand #" << i << " has unhandled type "
<< MVT::getValueTypeString(ArgVT) << "\n";
abort();
}
}
}
SDOperand X86TargetLowering::LowerCALL(SDOperand Op, SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
SDOperand Chain = Op.getOperand(0);
unsigned CC = cast<ConstantSDNode>(Op.getOperand(1))->getValue();
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
bool IsTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0
&& CC == CallingConv::Fast && PerformTailCallOpt;
SDOperand Callee = Op.getOperand(4);
bool Is64Bit = Subtarget->is64Bit();
bool IsStructRet = CallIsStructReturn(Op);
assert(!(isVarArg && CC == CallingConv::Fast) &&
"Var args not supported with calling convention fastcc");
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
CCAssignFn *CCFn = CCAssignFnForNode(Op);
X86::X86_64SRet SRetMethod = X86::InMemory;
if (Is64Bit && IsStructRet)
// FIXME: We can't figure out type of the sret structure for indirect
// calls. We need to copy more information from CallSite to the ISD::CALL
// node.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
SRetMethod =
ClassifyX86_64SRetCallReturn(dyn_cast<Function>(G->getGlobal()));
// UGLY HACK! For x86-64, some 128-bit aggregates are returned in a pair of
// registers. Unfortunately, llvm does not support i128 yet so we pretend it's
// a sret call.
if (SRetMethod != X86::InMemory)
X86_64AnalyzeSRetCallOperands(Op.Val, CCFn, CCInfo);
else
CCInfo.AnalyzeCallOperands(Op.Val, CCFn);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
if (CC == CallingConv::Fast)
NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
// Make sure the argument area takes 8n+4 bytes so that the start of the
// arguments, and the arguments after the return address has been pushed, are
// aligned.
if (!Is64Bit && CC == CallingConv::X86_FastCall &&
!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows() &&
(NumBytes & 7) == 0)
NumBytes += 4;
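// For example, assuming the intent is 8-byte alignment once the 4-byte return
// address is pushed: NumBytes == 16 satisfies (NumBytes & 7) == 0 and becomes
// 20, so the 20 bytes of arguments plus the return address total 24 bytes, a
// multiple of 8.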
int FPDiff = 0;
if (IsTailCall) {
// Lower arguments at fp - stackoffset + fpdiff.
unsigned NumBytesCallerPushed =
MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the returnaddr stackslot.
// But only set if delta is greater than previous delta.
if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
}
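// Worked example: if the caller reserved 16 bytes of argument space
// (BytesToPopOnReturn) but this tail callee needs NumBytes == 24, then
// FPDiff == 16 - 24 == -8, i.e. the return address slot has to move 8 bytes
// further down the stack.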
Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes));
SDOperand RetAddrFrIdx, NewRetAddrFrIdx;
if (IsTailCall) {
// Adjust the Return address stack slot.
if (FPDiff) {
MVT::ValueType VT = Is64Bit ? MVT::i64 : MVT::i32;
RetAddrFrIdx = getReturnAddressFrameIndex(DAG);
// Load the "old" Return address.
RetAddrFrIdx =
DAG.getLoad(VT, Chain, RetAddrFrIdx, NULL, 0);
// Calculate the new stack slot for the return address.
int SlotSize = Is64Bit ? 8 : 4;
int NewReturnAddrFI =
MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
Chain = SDOperand(RetAddrFrIdx.Val, 1);
}
}
SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
SmallVector<SDOperand, 8> MemOpChains;
SDOperand StackPtr;
// Walk the register/memloc assignments, inserting copies/loads. For tail
// calls, lower arguments which could otherwise be possibly overwritten to the
// stack slot where they would go on normal function calls.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
// Promote the value if needed.
switch (VA.getLocInfo()) {
default: assert(0 && "Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::SExt:
Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
break;
case CCValAssign::ZExt:
Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
break;
}
if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else {
if (!IsTailCall || IsPossiblyOverwrittenArgumentOfTailCall(Arg)) {
assert(VA.isMemLoc());
if (StackPtr.Val == 0)
StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy());
MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
Arg));
}
}
}
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
&MemOpChains[0], MemOpChains.size());
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into registers.
SDOperand InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
InFlag);
InFlag = Chain.getValue(1);
}
if (IsTailCall)
InFlag = SDOperand(); // ??? Isn't this nuking the preceding loop's output?
// ELF / PIC requires GOT in the EBX register before function calls via PLT
// GOT pointer.
// Does not work with tail call since ebx is not restored correctly by
// tailcaller. TODO: at least for x86 - verify for x86-64
if (!IsTailCall && !Is64Bit &&
getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT()) {
Chain = DAG.getCopyToReg(Chain, X86::EBX,
DAG.getNode(X86ISD::GlobalBaseReg, getPointerTy()),
InFlag);
InFlag = Chain.getValue(1);
}
if (Is64Bit && isVarArg) {
// From AMD64 ABI document:
// For calls that may call functions that use varargs or stdargs
// (prototype-less calls or calls to functions containing ellipsis (...) in
// the declaration) %al is used as hidden argument to specify the number
// of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
// registers used and is in the range 0 - 8 inclusive.
// Count the number of XMM registers allocated.
static const unsigned XMMArgRegs[] = {
X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
};
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
Chain = DAG.getCopyToReg(Chain, X86::AL,
DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
InFlag = Chain.getValue(1);
}
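// For example, a varargs call that passes a single double in XMM0 has
// NumXMMRegs == 1, so AL is set to 1; any value that is an upper bound on the
// number of XMM registers actually used (up to 8) would also satisfy the ABI.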
// For tail calls lower the arguments to the 'real' stack slot.
if (IsTailCall) {
SmallVector<SDOperand, 8> MemOpChains2;
SDOperand FIN;
int FI = 0;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc()) {
assert(VA.isMemLoc());
SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
unsigned Flags = cast<ConstantSDNode>(FlagsOp)->getValue();
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
FIN = DAG.getFrameIndex(FI, MVT::i32);
SDOperand Source = Arg;
if (IsPossiblyOverwrittenArgumentOfTailCall(Arg)) {
// Copy from stack slots to stack slot of a tail called function. This
// needs to be done because if we would lower the arguments directly
// to their real stack slot we might end up overwriting each other.
// Get source stack slot.
Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
if (StackPtr.Val == 0)
StackPtr = DAG.getCopyFromReg(Chain, X86StackPtr, getPointerTy());
Source = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, Source);
if ((Flags & ISD::ParamFlags::ByVal)==0)
Source = DAG.getLoad(VA.getValVT(), Chain, Source, NULL, 0);
}
if (Flags & ISD::ParamFlags::ByVal) {
// Copy relative to framepointer.
MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, Chain,
Flags, DAG));
} else {
MemOpChains2.push_back(
DAG.getStore(Chain, Source, FIN,
PseudoSourceValue::getFixedStack(), FI));
}
}
}
if (!MemOpChains2.empty())
Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
&MemOpChains2[0], MemOpChains2.size());
// Store the return address to the appropriate stack slot.
if (FPDiff)
Chain = DAG.getStore(Chain, RetAddrFrIdx, NewRetAddrFrIdx, NULL, 0);
}
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
// We should use extra load for direct calls to dllimported functions in
// non-JIT mode.
if ((IsTailCall || !Is64Bit ||
getTargetMachine().getCodeModel() != CodeModel::Large)
&& !Subtarget->GVRequiresExtraLoad(G->getGlobal(),
getTargetMachine(), true))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
if (IsTailCall || !Is64Bit ||
getTargetMachine().getCodeModel() != CodeModel::Large)
Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
} else if (IsTailCall) {
assert(Callee.getOpcode() == ISD::LOAD &&
"Function destination must be loaded into virtual register");
unsigned Opc = Is64Bit ? X86::R9 : X86::ECX;
Chain = DAG.getCopyToReg(Chain,
DAG.getRegister(Opc, getPointerTy()),
Callee, InFlag);
Callee = DAG.getRegister(Opc, getPointerTy());
// Add register as live out.
DAG.getMachineFunction().getRegInfo().addLiveOut(Opc);
}
// Returns a chain & a flag for retval copy to use.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
SmallVector<SDOperand, 8> Ops;
if (IsTailCall) {
Ops.push_back(Chain);
Ops.push_back(DAG.getIntPtrConstant(NumBytes));
Ops.push_back(DAG.getIntPtrConstant(0));
if (InFlag.Val)
Ops.push_back(InFlag);
Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
// Returns a chain & a flag for retval copy to use.
NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
Ops.clear();
}
Ops.push_back(Chain);
Ops.push_back(Callee);
if (IsTailCall)
Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
// Add an implicit use GOT pointer in EBX.
if (!IsTailCall && !Is64Bit &&
getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
Subtarget->isPICStyleGOT())
Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
// Add argument registers to the end of the list so that they are known live
// into the call.
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
Ops.push_back(DAG.getRegister(RegsToPass[i].first,
RegsToPass[i].second.getValueType()));
if (InFlag.Val)
Ops.push_back(InFlag);
if (IsTailCall) {
assert(InFlag.Val &&
"Flag must be set. Depend on flag being set in LowerRET");
Chain = DAG.getNode(X86ISD::TAILCALL,
Op.Val->getVTList(), &Ops[0], Ops.size());
return SDOperand(Chain.Val, Op.ResNo);
}
Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPush;
if (IsCalleePop(Op))
NumBytesForCalleeToPush = NumBytes; // Callee pops everything
else if (!Is64Bit && IsStructRet)
// If this is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
// This is common for Darwin/X86, Linux & Mingw32 targets.
NumBytesForCalleeToPush = 4;
else
NumBytesForCalleeToPush = 0; // Callee pops nothing.
// Returns a flag for retval copy to use.
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytes),
DAG.getIntPtrConstant(NumBytesForCalleeToPush),
InFlag);
InFlag = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
switch (SRetMethod) {
default:
return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
case X86::InGPR64:
return SDOperand(LowerCallResultToTwo64BitRegs(Chain, InFlag, Op.Val,
X86::RAX, X86::RDX,
MVT::i64, DAG), Op.ResNo);
case X86::InSSE:
return SDOperand(LowerCallResultToTwo64BitRegs(Chain, InFlag, Op.Val,
X86::XMM0, X86::XMM1,
MVT::f64, DAG), Op.ResNo);
case X86::InX87:
return SDOperand(LowerCallResultToTwoX87Regs(Chain, InFlag, Op.Val, DAG),
Op.ResNo);
}
}
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2 registers are
// free for argument passing (inreg). Tail call optimization is performed
// provided:
// * tailcallopt is enabled
// * caller/callee are fastcc
// * elf/pic is disabled OR
// * elf/pic enabled + callee is in module + callee has
// visibility protected or hidden
// To keep the stack aligned according to the platform ABI, the function
// GetAlignedArgumentStackSize ensures that the argument delta is always a
// multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
// for example.)
// If a tail-called callee has more arguments than the caller, the caller needs
// to make sure that there is room to move the RETADDR to. This is achieved by
// reserving an area the size of the argument delta right after the original
// RETADDR, but before the saved frame pointer or the spilled registers,
// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
// stack layout:
// arg1
// arg2
// RETADDR
// [ new RETADDR
// move area ]
// (possible EBP)
// ESI
// EDI
// local1 ..
/// GetAlignedArgumentStackSize - Round up the stack size so that it is, e.g.,
/// 16n + 12 bytes for a 16-byte alignment requirement.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
SelectionDAG& DAG) {
if (PerformTailCallOpt) {
MachineFunction &MF = DAG.getMachineFunction();
const TargetMachine &TM = MF.getTarget();
const TargetFrameInfo &TFI = *TM.getFrameInfo();
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
unsigned SlotSize = Subtarget->is64Bit() ? 8 : 4;
if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
// Number smaller than 12 so just add the difference.
Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
} else {
// Mask out lower bits, add stackalignment once plus the 12 bytes.
Offset = ((~AlignMask) & Offset) + StackAlignment +
(StackAlignment-SlotSize);
}
StackSize = Offset;
}
return StackSize;
}
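// Worked 32-bit example (16-byte stack alignment, 4-byte slot): StackSize 20
// has 20 & 15 == 4 <= 12, so 8 is added and the result is 28 == 16 + 12;
// StackSize 30 has 30 & 15 == 14 > 12, so it becomes (30 & ~15) + 16 + 12 ==
// 44 == 32 + 12. Either way the argument area ends one slot short of a
// 16-byte boundary, so pushing the return address restores full alignment.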
/// IsEligibleForTailCallOptimization - Check to see whether the next
/// instruction following the call is a return. A function is eligible if
/// caller/callee calling conventions match, currently only fastcc supports
/// tail calls, and the function CALL is immediately followed by a RET.
bool X86TargetLowering::IsEligibleForTailCallOptimization(SDOperand Call,
SDOperand Ret,
SelectionDAG& DAG) const {
if (!PerformTailCallOpt)
return false;
// Check whether the CALL node immediately precedes the RET node and whether the
// return uses the result of the node or is a void return.
unsigned NumOps = Ret.getNumOperands();
if ((NumOps == 1 &&
(Ret.getOperand(0) == SDOperand(Call.Val,1) ||
Ret.getOperand(0) == SDOperand(Call.Val,0))) ||
(NumOps > 1 &&
Ret.getOperand(0) == SDOperand(Call.Val,Call.Val->getNumValues()-1) &&
Ret.getOperand(1) == SDOperand(Call.Val,0))) {
MachineFunction &MF = DAG.getMachineFunction();
unsigned CallerCC = MF.getFunction()->getCallingConv();
unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
SDOperand Callee = Call.getOperand(4);
// On elf/pic %ebx needs to be livein.
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ ||
!Subtarget->isPICStyleGOT())
return true;
// Can only do local tail calls with PIC.
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
return G->getGlobal()->hasHiddenVisibility()
|| G->getGlobal()->hasProtectedVisibility();
}
}
return false;
}
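// Illustrative eligible case (pseudo-code, assuming both functions use fastcc
// and -tailcallopt is enabled):
//   fastcc int callee(int);
//   fastcc int caller(int x) { return callee(x); }
// The CALL node is immediately followed by a RET that returns its result, so
// the checks above succeed; with ELF/PIC + GOT the callee would additionally
// need hidden or protected visibility.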
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
SDOperand X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
int ReturnAddrIndex = FuncInfo->getRAIndex();
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
if (Subtarget->is64Bit())
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(8, -8);
else
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, -4);
FuncInfo->setRAIndex(ReturnAddrIndex);
}
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}
/// translateX86CC - do a one-to-one translation of an ISD::CondCode to the X86
/// specific condition code. It returns false if it cannot do a direct
/// translation. X86CC is the translated CondCode. LHS/RHS are modified as
/// needed.
static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
unsigned &X86CC, SDOperand &LHS, SDOperand &RHS,
SelectionDAG &DAG) {
X86CC = X86::COND_INVALID;
if (!isFP) {
if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
// X > -1 -> X == 0, jump !sign.
RHS = DAG.getConstant(0, RHS.getValueType());
X86CC = X86::COND_NS;
return true;
} else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
// X < 0 -> X == 0, jump on sign.
X86CC = X86::COND_S;
return true;
} else if (SetCCOpcode == ISD::SETLT && RHSC->getValue() == 1) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, RHS.getValueType());
X86CC = X86::COND_LE;
return true;
}
}
switch (SetCCOpcode) {
default: break;
case ISD::SETEQ: X86CC = X86::COND_E; break;
case ISD::SETGT: X86CC = X86::COND_G; break;
case ISD::SETGE: X86CC = X86::COND_GE; break;
case ISD::SETLT: X86CC = X86::COND_L; break;
case ISD::SETLE: X86CC = X86::COND_LE; break;
case ISD::SETNE: X86CC = X86::COND_NE; break;
case ISD::SETULT: X86CC = X86::COND_B; break;
case ISD::SETUGT: X86CC = X86::COND_A; break;
case ISD::SETULE: X86CC = X86::COND_BE; break;
case ISD::SETUGE: X86CC = X86::COND_AE; break;
}
} else {
// On a floating point condition, the flags are set as follows:
// ZF PF CF op
// 0 | 0 | 0 | X > Y
// 0 | 0 | 1 | X < Y
// 1 | 0 | 0 | X == Y
// 1 | 1 | 1 | unordered
bool Flip = false;
switch (SetCCOpcode) {
default: break;
case ISD::SETUEQ:
case ISD::SETEQ: X86CC = X86::COND_E; break;
case ISD::SETOLT: Flip = true; // Fallthrough
case ISD::SETGT: X86CC = X86::COND_A; break;
case ISD::SETOLE: Flip = true; // Fallthrough
case ISD::SETGE: X86CC = X86::COND_AE; break;
case ISD::SETUGT: Flip = true; // Fallthrough
case ISD::SETLT: X86CC = X86::COND_B; break;
case ISD::SETUGE: Flip = true; // Fallthrough
case ISD::SETLE: X86CC = X86::COND_BE; break;
case ISD::SETNE: X86CC = X86::COND_NE; break;
case ISD::SETUO: X86CC = X86::COND_P; break;
case ISD::SETO: X86CC = X86::COND_NP; break;
}
if (Flip)
std::swap(LHS, RHS);
}
return X86CC != X86::COND_INVALID;
}
/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code. The current x86 ISA includes the following FP cmov instructions:
/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
switch (X86CC) {
default:
return false;
case X86::COND_B:
case X86::COND_BE:
case X86::COND_E:
case X86::COND_P:
case X86::COND_A:
case X86::COND_AE:
case X86::COND_NE:
case X86::COND_NP:
return true;
}
}
/// isUndefOrInRange - Op is either an undef node or a ConstantSDNode. Return
/// true if Op is undef or if its value falls within the specified range [Low, Hi).
static bool isUndefOrInRange(SDOperand Op, unsigned Low, unsigned Hi) {
if (Op.getOpcode() == ISD::UNDEF)
return true;
unsigned Val = cast<ConstantSDNode>(Op)->getValue();
return (Val >= Low && Val < Hi);
}
/// isUndefOrEqual - Op is either an undef node or a ConstantSDNode. Return
/// true if Op is undef or if its value is equal to the specified value.
static bool isUndefOrEqual(SDOperand Op, unsigned Val) {
if (Op.getOpcode() == ISD::UNDEF)
return true;
return cast<ConstantSDNode>(Op)->getValue() == Val;
}
/// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFD.
bool X86::isPSHUFDMask(SDNode *N) {
assert(N->getOpcode() == ISD::BUILD_VECTOR);
if (N->getNumOperands() != 2 && N->getNumOperands() != 4)
return false;
// Check if the value doesn't reference the second vector.
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
SDOperand Arg = N->getOperand(i);
if (Arg.getOpcode() == ISD::UNDEF) continue;
assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
if (cast<ConstantSDNode>(Arg)->getValue() >= e)
return false;
}
return true;
}
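// For example, the 4-element mask <2, 3, 0, 1> only references elements 0-3 of
// the first vector, so it is accepted as a PSHUFD mask, whereas <4, 1, 2, 3>
// references element 4 (the second vector) and is rejected by the check above.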
/// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFHW.
bool X86::isPSHUFHWMask(SDNode *N) {
assert(N->getOpcode() == ISD::BUILD_VECTOR);
if (N->getNumOperands() != 8)
return false;
// Lower quadword copied in order.
for (unsigned i = 0; i != 4; ++i) {
SDOperand Arg = N->getOperand(i);
if (Arg.getOpcode() == ISD::UNDEF) continue;
assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
if (cast<ConstantSDNode>(Arg)->getValue() != i)
return false;
}
// Upper quadword shuffled.
for (unsigned i = 4; i != 8; ++i) {
SDOperand Arg = N->getOperand(i);
if (Arg.getOpcode() == ISD::UNDEF) continue;
assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
if (Val < 4 || Val > 7)
return false;
}
return true;
}
/// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFLW.
bool X86::isPSHUFLWMask(SDNode *N) {
assert(N->getOpcode() == ISD::BUILD_VECTOR);
if (N->getNumOperands() != 8)
return false;
// Upper quadword copied in order.
for (unsigned i = 4; i != 8; ++i)
if (!isUndefOrEqual(N->getOperand(i), i))
return false;
// Lower quadword shuffled.