bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
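  // For example, "addw $1, %ax" needs a 0x66 operand-size prefix that
  // "addl $1, %eax" does not, so it is a byte longer and can trigger
  // length-changing-prefix stalls on some microarchitectures.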
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}

/// isShuffleMaskLegal - Targets can use this to indicate that they only
/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSizeInBits() == 64)
    return isPALIGNRMask(M, VT, Subtarget->hasSSSE3());

  // FIXME: pshufb, blends, shifts.
  return (VT.getVectorNumElements() == 2 ||
          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
          isMOVLMask(M, VT) ||
          isSHUFPMask(M, VT) ||
          isPSHUFDMask(M, VT) ||
          isPSHUFHWMask(M, VT) ||
          isPSHUFLWMask(M, VT) ||
          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
          isUNPCKLMask(M, VT) ||
          isUNPCKHMask(M, VT) ||
          isUNPCKL_v_undef_Mask(M, VT) ||
          isUNPCKH_v_undef_Mask(M, VT));
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  unsigned NumElts = VT.getVectorNumElements();
  // FIXME: This collection of masks seems suspect.
  if (NumElts == 2)
    return true;
  if (NumElts == 4 && VT.getSizeInBits() == 128) {
    return (isMOVLMask(Mask, VT)  ||
            isCommutedMOVLMask(Mask, VT, true) ||
            isSHUFPMask(Mask, VT) ||
            isCommutedSHUFPMask(Mask, VT));
  }
  return false;
}

//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpc,
                                                       unsigned immOpc,
                                                       unsigned LoadOpc,
                                                       unsigned CXchgOpc,
                                                       unsigned notOpc,
                                                       unsigned EAXreg,
                                                       TargetRegisterClass *RC,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld  t1 = [bitinstr.addr]
  //     op  t2 = t1, [bitinstr.val]
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz  newMBB
  //     fallthrough -->nextMBB
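  //
  // As a concrete sketch (an illustrative i32 atomic AND; real registers
  // are assigned later by the register allocator):
  //   .LnewMBB:
  //     movl  (%rdi), %eax          # t1 = load
  //     movl  %eax, %ecx
  //     andl  %esi, %ecx            # t2 = t1 & val
  //     lock cmpxchgl %ecx, (%rdi)  # store t2 iff (%rdi) still equals %eax
  //     jne   .LnewMBB              # another thread won the race; retry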
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);
  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);
  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);
  // Insert instructions into newMBB based on incoming instruction
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  DebugLoc dl = bInstr->getDebugLoc();
  MachineOperand& destOper = bInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = bInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &bInstr->getOperand(i+1);

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;
  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
  if (invSrc) {
    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
  }
  else
    tt = t1;

  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
  MIB.addReg(tt);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(EAXreg);
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
  return nextMBB;
}

// private utility function: 64-bit atomics on a 32-bit host.
MachineBasicBlock *
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpcL,
                                                       unsigned regOpcH,
                                                       unsigned immOpcL,
                                                       unsigned immOpcH,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB (instructions are in pairs, except cmpxchg8b)
  //     ld t1,t2 = [bitinstr.addr]
  //   newMBB:
  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
  //     op  t5, t6 <- out1, out2, [bitinstr.val]
  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
  //     mov ECX, EBX <- t5, t6
  //     mov EAX, EDX <- t1, t2
  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
  //     mov t3, t4 <- EAX, EDX
  //     bz  newMBB
  //     result in out1, out2
  //     fallthrough -->nextMBB
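  //
  //   A sketch of why this works: cmpxchg8b compares the 8-byte memory
  //   operand against EDX:EAX; on a match it stores ECX:EBX atomically,
  //   otherwise it loads the current memory value into EDX:EAX, so each
  //   retry of the loop runs with a fresh snapshot of memory.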

  const TargetRegisterClass *RC = X86::GR32RegisterClass;
  const unsigned LoadOpc = X86::MOV32rm;
  const unsigned NotOpc = X86::NOT32r;
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);
  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(bInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);
  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);
  DebugLoc dl = bInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
  assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
         "unexpected number of operands");
  MachineOperand& dest1Oper = bInstr->getOperand(0);
  MachineOperand& dest2Oper = bInstr->getOperand(1);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
    argOpers[i] = &bInstr->getOperand(i+2);

    // We use some of the operands multiple times, so conservatively just
    // clear any kill flags that might be present.
    if (argOpers[i]->isReg() && argOpers[i]->isUse())
      argOpers[i]->setIsKill(false);
  }

  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
  // add 4 to displacement.
  for (int i=0; i <= lastAddrIndx-2; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MachineOperand newOp3 = *(argOpers[3]);
  if (newOp3.isImm())
    newOp3.setImm(newOp3.getImm()+4);
  else
    newOp3.setOffset(newOp3.getOffset()+4);
  (*MIB).addOperand(newOp3);
  (*MIB).addOperand(*argOpers[lastAddrIndx]);

  // t3/4 are defined later, at the bottom of the loop
  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);

  // The subsequent operations should be using the destination registers of
  // the PHI instructions.
  if (invSrc) {
    t1 = F->getRegInfo().createVirtualRegister(RC);
    t2 = F->getRegInfo().createVirtualRegister(RC);
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
  } else {
    t1 = dest1Oper.getReg();
    t2 = dest2Oper.getReg();
  }

  int valArgIndx = lastAddrIndx + 1;
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
  if (regOpcL != X86::MOV32rr)
    MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  assert(argOpers[valArgIndx + 1]->isReg() ==
         argOpers[valArgIndx]->isReg());
  assert(argOpers[valArgIndx + 1]->isImm() ==
         argOpers[valArgIndx]->isImm());
  if (argOpers[valArgIndx + 1]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
  if (regOpcH != X86::MOV32rr)
    MIB.addReg(t2);
  (*MIB).addOperand(*argOpers[valArgIndx + 1]);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
  MIB.addReg(t2);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
  MIB.addReg(t5);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
  MIB.addReg(t6);
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
  MIB.addReg(X86::EDX);
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  bInstr->eraseFromParent();   // The pseudo instruction is gone now.
  return nextMBB;
}

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) const {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     cmp  t1, t2
  //     cmov[cond] t2 = t1
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz   newMBB
  //     fallthrough -->nextMBB
  //
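  // For illustration: an atomic signed max would pass a CMOV opcode (e.g.
  // a CMOVL variant; the exact cmovOpc is supplied by the caller) so that,
  // after "cmp t1, t2", the cmov leaves the winning value in a temporary,
  // and the lock'ed cmpxchg publishes it only if memory still holds t1.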
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);
  // Transfer the remainder of thisMBB and its successor edges to nextMBB.
  nextMBB->splice(nextMBB->begin(), thisMBB,
                  llvm::next(MachineBasicBlock::iterator(mInstr)),
                  thisMBB->end());
  nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);
  // newMBB jumps to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);
  DebugLoc dl = mInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
         "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86::AddrNumOperands];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);
  // x86 address has 5 operands: base, scale, index, displacement, and segment.
  int lastAddrIndx = X86::AddrNumOperands - 1; // [0,4]
  int valArgIndx = lastAddrIndx + 1;
  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  // We only support register and immediate values
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");

  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate the conditional move (cmov)
  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc), t3);
  MIB.addReg(t2);
  MIB.addReg(t1);

  // Compare and exchange if no one has modified the memory location
  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t3);
  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(mInstr->memoperands_begin(),
                    mInstr->memoperands_end());
  MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
  MIB.addReg(X86::EAX);
  BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
  mInstr->eraseFromParent();   // The pseudo instruction is gone now.
  return nextMBB;
}

// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
MachineBasicBlock *
X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
                            unsigned numArgs, bool memArg) const {
  assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) &&
         "Target must have SSE4.2 or AVX features enabled");

  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  unsigned Opc;
  if (!Subtarget->hasAVX()) {
    if (memArg)
      Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
    else
      Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
  } else {
    if (memArg)
      Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
    else
      Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
  }
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
  for (unsigned i = 0; i < numArgs; ++i) {
    MachineOperand &Op = MI->getOperand(i+1);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.addOperand(Op);
  }
  BuildMI(*BB, MI, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
    .addReg(X86::XMM0);

  MI->eraseFromParent();
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget->is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.addOperand(MI->getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
    .addReg(MI->getOperand(ValOps+1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MONITORrrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // First arg in ECX, the second in EAX.
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
    .addReg(MI->getOperand(0).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
    .addReg(MI->getOperand(1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::MWAITrr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(
                   MachineInstr *MI,
                   MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI->getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  assert(X86::AddrNumOperands == 5 && "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI->getOperand(0).getReg();
  MachineOperand &Base = MI->getOperand(1);
  MachineOperand &Scale = MI->getOperand(2);
  MachineOperand &Index = MI->getOperand(3);
  MachineOperand &Disp = MI->getOperand(4);
  MachineOperand &Segment = MI->getOperand(5);
  unsigned ArgSize = MI->getOperand(6).getImm();
  unsigned ArgMode = MI->getOperand(7).getImm();
  unsigned Align = MI->getOperand(8).getImm();

  // Memory Reference
  assert(MI->hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI->getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8
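  //
  // A C-level sketch of what the emitted code computes (gp_offset case;
  // names mirror the struct above, constants from the SysV AMD64 ABI):
  //   if (gp_offset + arg_size <= 6*8) {          // room in the save area?
  //     addr = reg_save_area + gp_offset;
  //     gp_offset += 8;
  //   } else {
  //     addr = align(overflow_area, max(align, 8));
  //     overflow_area = addr + roundup(arg_size, 8);
  //   }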

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  // Align ArgSize to a multiple of 8.
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
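  // e.g. ArgSize == 12 rounds up to ArgSizeA8 == 16.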
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;    // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0;  // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = NULL;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //       thisMBB
    //         |     .
    //         |        .
    //     offsetMBB   overflowMBB
    //         |        .
    //         |     .
    //        endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = MBB;
    ++MBBIter;

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                    llvm::next(MachineBasicBlock::iterator(MI)),
                    thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
      .addReg(OffsetReg)
      .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
      .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, 16)
      .addOperand(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
      .addImm(0)
      .addReg(OffsetReg)
      .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
      .addReg(OffsetReg64)
      .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
      .addReg(OffsetReg)
      .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
      .addOperand(Base)
      .addOperand(Scale)
      .addOperand(Index)
      .addDisp(Disp, UseFPOffset ? 4 : 0)
      .addOperand(Segment)
      .addReg(NextOffsetReg)
      .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_4))
      .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert((Align & (Align-1)) == 0 && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
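    // e.g. addr == 0x1003 with align == 16: (0x1003 + 15) & ~15 == 0x1010.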
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
      .addReg(OverflowAddrReg)
      .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
      .addReg(TmpReg)
      .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
      .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
    .addReg(OverflowDestReg)
    .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
    .addOperand(Base)
    .addOperand(Scale)
    .addOperand(Index)
    .addDisp(Disp, 8)
    .addOperand(Segment)
    .addReg(NextAddrReg)
    .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
      .addReg(OffsetDestReg).addMBB(offsetMBB)
      .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI->eraseFromParent();

  return endMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.
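  //
  // The resulting block structure is a simple conditional skip (sketch):
  //   MBB:         testb %al, %al ; je EndMBB     (test skipped on Win64)
  //   XMMSaveMBB:  movaps %xmmN, offset(reg_save_area)  for each arg reg
  //   EndMBB:      fall-through continues here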

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 llvm::next(MachineBasicBlock::iterator(MI)),
                 MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_4)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
          MachinePointerInfo::getFixedStack(RegSaveFrameIndex, Offset),
          MachineMemOperand::MOStore,
          /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  MI->eraseFromParent();   // The pseudo instruction is gone now.

  return EndMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();
  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;
  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);

  // If the EFLAGS register isn't dead in the terminator, then claim that it's
  // live into the sink and copy blocks.
  const MachineFunction *MF = BB->getParent();
  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
  BitVector ReservedRegs = TRI->getReservedRegs(*MF);

  for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
    const MachineOperand &MO = MI->getOperand(I);
    if (!MO.isReg() || !MO.isUse() || MO.isKill()) continue;
    unsigned Reg = MO.getReg();
    if (Reg != X86::EFLAGS) continue;
    copy0MBB->addLiveIn(Reg);
    sinkMBB->addLiveIn(Reg);
  }

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), BB,
                  llvm::next(MachineBasicBlock::iterator(MI)),
                  BB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  // Create the conditional branch instruction.
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return sinkMBB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI,
                                        MachineBasicBlock *BB) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // The lowering is pretty easy: we're just emitting the call to _alloca.  The
  // non-trivial part is impdef of ESP.
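  // Sketch of the 32-bit path emitted below (assuming the usual probe
  // convention): earlier lowering puts the allocation size in EAX, the
  // call touches each new stack page, and ESP/EAX/EFLAGS are modeled as
  // implicit defs because the probe routine clobbers them.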

  if (Subtarget->isTargetWin64()) {
    if (Subtarget->isTargetCygMing()) {
      // ___chkstk(Mingw64):
      // Clobbers R10, R11, RAX and EFLAGS.
      // Updates RSP.
      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
        .addExternalSymbol("___chkstk")
        .addReg(X86::RAX, RegState::Implicit)
        .addReg(X86::RSP, RegState::Implicit)
        .addReg(X86::RAX, RegState::Define | RegState::Implicit)
        .addReg(X86::RSP, RegState::Define | RegState::Implicit)
        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
    } else {
      // __chkstk(MSVCRT): does not update stack pointer.
      // Clobbers R10, R11 and EFLAGS.
      // FIXME: RAX(allocated size) might be reused and not killed.
      BuildMI(*BB, MI, DL, TII->get(X86::W64ALLOCA))
        .addExternalSymbol("__chkstk")
        .addReg(X86::RAX, RegState::Implicit)
        .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
      // RAX has the offset to be subtracted from RSP.
      BuildMI(*BB, MI, DL, TII->get(X86::SUB64rr), X86::RSP)
        .addReg(X86::RSP)
        .addReg(X86::RAX);
    }
  } else {
    const char *StackProbeSymbol =
      Subtarget->isTargetWindows() ? "_chkstk" : "_alloca";

    BuildMI(*BB, MI, DL, TII->get(X86::CALLpcrel32))
      .addExternalSymbol(StackProbeSymbol)
      .addReg(X86::EAX, RegState::Implicit)
      .addReg(X86::ESP, RegState::Implicit)
      .addReg(X86::EAX, RegState::Define | RegState::Implicit)
      .addReg(X86::ESP, RegState::Define | RegState::Implicit)
      .addReg(X86::EFLAGS, RegState::Define | RegState::Implicit);
  }
  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return BB;
}

MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
                                      MachineBasicBlock *BB) const {
  // This is pretty easy.  We're taking the value that we received from
  // our load from the relocation, sticking it in either RDI (x86-64)
  // or EAX and doing an indirect call.  The return value will then
  // be in the normal return register.
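  //
  // Sketch of the sequence this emits for x86-64 (Darwin's TLV scheme):
  //   movq  _var@TLVP(%rip), %rdi   # descriptor address from the reloc
  //   callq *(%rdi)                 # returns the variable address in %rax
  // ("_var" is illustrative; the global comes from operand 3 below.)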
  const X86InstrInfo *TII
    = static_cast<const X86InstrInfo*>(getTargetMachine().getInstrInfo());
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction *F = BB->getParent();

  assert(Subtarget->isTargetDarwin() && "Darwin only instr emitted?");
  assert(MI->getOperand(3).isGlobal() && "This should be a global");
  if (Subtarget->is64Bit()) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV64rm), X86::RDI)
    .addReg(X86::RIP)
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
    addDirectMem(MIB, X86::RDI);
  } else if (getTargetMachine().getRelocationModel() != Reloc::PIC_) {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,
                                      TII->get(X86::MOV32rm), X86::EAX)
    .addReg(0)
    .addImm(0).addReg(0)
    .addGlobalAddress(MI->getOperand(3).getGlobal(), 0,
                      MI->getOperand(3).getTargetFlags())
    .addReg(0);
    MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
    addDirectMem(MIB, X86::EAX);
  } else {
    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL,