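  // x86 address has 4 operands: base, index, scale, and displacement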
  int lastAddrIndx = 3; // [0,3]
  int valArgIndx = 4;
  
  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  
  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  assert(   (argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm())
         && "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, TII->get(immOpc), t2);
  MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  
  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), X86::EAX);
  MIB.addReg(t1);
  
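  // LCMPXCHG32 implicitly compares EAX against [addr]: if they are equal the
  // new value t2 is stored, otherwise EAX receives the current memory value,
  // which is why the loaded value t1 was copied into EAX above.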
  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG32));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  
  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), destOper.getReg());
  MIB.addReg(X86::EAX);
  
  // insert branch
  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);

  delete bInstr;   // The pseudo instruction is gone now.
  return nextMBB;
}

// private utility function
MachineBasicBlock *
X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
                                                      MachineBasicBlock *MBB,
                                                      unsigned cmovOpc) {
  // For the atomic min/max operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld t1 = [min/max.addr]
  //     mov t2 = [min/max.val]
  //     mov EAX = t1
  //     cmp  t1, t2
  //     cmov[cond] t3 = t1            ; t3 = min/max(t1, t2)
  //     lcs dest = [min/max.addr], t3  [EAX is implicit]
  //     jne  newMBB                   ; retry if memory was modified
  //     fallthrough -->nextMBB
  //
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  ilist<MachineBasicBlock>::iterator MBBIter = MBB;
  ++MBBIter;
  
  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = new MachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = new MachineBasicBlock(LLVM_BB);
  F->getBasicBlockList().insert(MBBIter, newMBB);
  F->getBasicBlockList().insert(MBBIter, nextMBB);
  
  // Move all successors of thisMBB to nextMBB
  nextMBB->transferSuccessors(thisMBB);
  
  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);
  
  // newMBB loops back to itself and falls through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);
  
  // Insert instructions into newMBB based on incoming instruction
  assert(mInstr->getNumOperands() < 8 && "unexpected number of operands");
  MachineOperand& destOper = mInstr->getOperand(0);
  MachineOperand* argOpers[6];
  int numArgs = mInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &mInstr->getOperand(i+1);
  
  // x86 address has 4 operands: base, index, scale, and displacement
  int lastAddrIndx = 3; // [0,3]
  int valArgIndx = 4;
  
  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MachineInstrBuilder MIB = BuildMI(newMBB, TII->get(X86::MOV32rm), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  // We only support register and immediate values
  assert(   (argOpers[valArgIndx]->isReg() || argOpers[valArgIndx]->isImm())
         && "invalid operand");
  
  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);  
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), t2);
  else
    MIB = BuildMI(newMBB, TII->get(X86::MOV32ri), t2);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), X86::EAX);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, TII->get(X86::CMP32rr));
  MIB.addReg(t1);
  MIB.addReg(t2);

  // Generate the conditional move (cmov)
  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
  MIB = BuildMI(newMBB, TII->get(cmovOpc),t3);
  MIB.addReg(t2);
  MIB.addReg(t1);
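  // t3 = min/max(t1, t2): the cmov selects between the incoming value t2 and
  // the loaded value t1 according to the condition computed by the cmp above.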

  // Compare and exchange if no one has modified the memory location
  MIB = BuildMI(newMBB, TII->get(X86::LCMPXCHG32));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t3);
  
  MIB = BuildMI(newMBB, TII->get(X86::MOV32rr), destOper.getReg());
  MIB.addReg(X86::EAX);
  
  // insert branch
  BuildMI(newMBB, TII->get(X86::JNE)).addMBB(newMBB);

  delete mInstr;   // The pseudo instruction is gone now.
  return nextMBB;
}


MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  switch (MI->getOpcode()) {
  default: assert(false && "Unexpected instr type to insert");
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64: {
    // To "insert" a SELECT_CC instruction, we actually have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // destination vreg to set, the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    ilist<MachineBasicBlock>::iterator It = BB;
    ++It;
    //  thisMBB:
    //  ...
    //   TrueVal = ...
    //   cmpTY ccX, r1, r2
    //   bCC copy1MBB
    //   fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB = BB;
    MachineBasicBlock *copy0MBB = new MachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = new MachineBasicBlock(LLVM_BB);
    unsigned Opc =
      X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
    BuildMI(BB, TII->get(Opc)).addMBB(sinkMBB);
    MachineFunction *F = BB->getParent();
    F->getBasicBlockList().insert(It, copy0MBB);
    F->getBasicBlockList().insert(It, sinkMBB);
    // Update machine-CFG edges by transferring all successors of the current
    // block to the new block which will contain the Phi node for the select.
    sinkMBB->transferSuccessors(BB);

    // Add the true and fallthrough blocks as its successors.
    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);
    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
    BB = copy0MBB;
    // Update machine-CFG edges
    BB->addSuccessor(sinkMBB);
    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
    BB = sinkMBB;
    BuildMI(BB, TII->get(X86::PHI), MI->getOperand(0).getReg())
      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

    delete MI;   // The pseudo instruction is gone now.
    return BB;
  }

  case X86::FP32_TO_INT16_IN_MEM:
  case X86::FP32_TO_INT32_IN_MEM:
  case X86::FP32_TO_INT64_IN_MEM:
  case X86::FP64_TO_INT16_IN_MEM:
  case X86::FP64_TO_INT32_IN_MEM:
  case X86::FP64_TO_INT64_IN_MEM:
  case X86::FP80_TO_INT16_IN_MEM:
  case X86::FP80_TO_INT32_IN_MEM:
  case X86::FP80_TO_INT64_IN_MEM: {
    // Change the floating point control register to use "round towards zero"
    // mode when truncating to an integer value.
    MachineFunction *F = BB->getParent();
    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
    addFrameReference(BuildMI(BB, TII->get(X86::FNSTCW16m)), CWFrameIdx);

    // Load the old value of the high byte of the control word...
    unsigned OldCW =
      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
    addFrameReference(BuildMI(BB, TII->get(X86::MOV16rm), OldCW), CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(BB, TII->get(X86::MOV16mi)), CWFrameIdx)
      .addImm(0xC7F);
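    // 0xC7F sets the rounding-control field (bits 11:10) to 11b, i.e. round
    // toward zero (truncate), while keeping all x87 exception mask bits set.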

    // Reload the modified control word now...
    addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(BB, TII->get(X86::MOV16mr)), CWFrameIdx)
      .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI->getOpcode()) {
    default: assert(0 && "illegal opcode!");
    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
    }

    X86AddressMode AM;
    MachineOperand &Op = MI->getOperand(0);
    if (Op.isRegister()) {
      AM.BaseType = X86AddressMode::RegBase;
      AM.Base.Reg = Op.getReg();
    } else {
      AM.BaseType = X86AddressMode::FrameIndexBase;
      AM.Base.FrameIndex = Op.getIndex();
    }
    Op = MI->getOperand(1);
    if (Op.isImmediate())
      AM.Scale = Op.getImm();
    Op = MI->getOperand(2);
    if (Op.isImmediate())
      AM.IndexReg = Op.getImm();
    Op = MI->getOperand(3);
    if (Op.isGlobalAddress()) {
      AM.GV = Op.getGlobal();
    } else {
      AM.Disp = Op.getImm();
    }
    addFullAddress(BuildMI(BB, TII->get(Opc)), AM)
                      .addReg(MI->getOperand(4).getReg());

    // Reload the original control word now.
    addFrameReference(BuildMI(BB, TII->get(X86::FLDCW16m)), CWFrameIdx);

    delete MI;   // The pseudo instruction is gone now.
    return BB;
  }
  case X86::ATOMAND32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
                                                       X86::AND32ri);
  case X86::ATOMOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr, 
                                                       X86::OR32ri);
  case X86::ATOMXOR32:
    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
                                                       X86::XOR32ri);
  case X86::ATOMMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
  case X86::ATOMMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
  case X86::ATOMUMIN32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
  case X86::ATOMUMAX32:
    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
  }
}

//===----------------------------------------------------------------------===//
//                           X86 Optimization Hooks
//===----------------------------------------------------------------------===//

void X86TargetLowering::computeMaskedBitsForTargetNode(const SDOperand Op,
                                                       const APInt &Mask,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
  unsigned Opc = Op.getOpcode();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");
  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
  switch (Opc) {
  default: break;
  case X86ISD::SETCC:
    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
                                       Mask.getBitWidth() - 1);
    break;
  }
}

/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
/// node is a GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       GlobalValue* &GA, int64_t &Offset) const{
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}

static bool isBaseAlignmentOfN(unsigned N, SDNode *Base,
                               const TargetLowering &TLI) {
  GlobalValue *GV;
  int64_t Offset = 0;
  if (TLI.isGAPlusOffset(Base, GV, Offset))
    return (GV->getAlignment() >= N && (Offset % N) == 0);
  // DAG combine handles the stack object case.
  return false;
}

static bool EltsFromConsecutiveLoads(SDNode *N, SDOperand PermMask,
                                     unsigned NumElems, MVT::ValueType EVT,
                                     SDNode *&Base,
                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
                                     const TargetLowering &TLI) {
  Base = NULL;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDOperand Idx = PermMask.getOperand(i);
    if (Idx.getOpcode() == ISD::UNDEF) {
      if (!Base)
        return false;
      continue;
    }

    unsigned Index = cast<ConstantSDNode>(Idx)->getValue();
    SDOperand Elt = DAG.getShuffleScalarElt(N, Index);
    if (!Elt.Val ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.Val)))
      return false;
    if (!Base) {
      Base = Elt.Val;
      if (Base->getOpcode() == ISD::UNDEF)
        return false;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    if (!TLI.isConsecutiveLoad(Elt.Val, Base,
                               MVT::getSizeInBits(EVT)/8, i, MFI))
      return false;
  }
  return true;
}

/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
/// if the load addresses are consecutive, non-overlapping, and in the right
/// order.
static SDOperand PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
                                       const TargetLowering &TLI) {
  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
  MVT::ValueType VT = N->getValueType(0);
  MVT::ValueType EVT = MVT::getVectorElementType(VT);
  SDOperand PermMask = N->getOperand(2);
  unsigned NumElems = PermMask.getNumOperands();
  SDNode *Base = NULL;
  if (!EltsFromConsecutiveLoads(N, PermMask, NumElems, EVT, Base,
                                DAG, MFI, TLI))
    return SDOperand();

  LoadSDNode *LD = cast<LoadSDNode>(Base);
  if (isBaseAlignmentOfN(16, Base->getOperand(1).Val, TLI))
    return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
                       LD->getSrcValueOffset(), LD->isVolatile());
  return DAG.getLoad(VT, LD->getChain(), LD->getBasePtr(), LD->getSrcValue(),
                     LD->getSrcValueOffset(), LD->isVolatile(),
                     LD->getAlignment());
}

/// PerformBuildVectorCombine - build_vector 0,(load i64 / f64) -> movq / movsd.
static SDOperand PerformBuildVectorCombine(SDNode *N, SelectionDAG &DAG,
                                           const X86Subtarget *Subtarget,
                                           const TargetLowering &TLI) {
  // Ignore single operand BUILD_VECTOR.
  if (N->getNumOperands() == 1)
    return SDOperand();

  MVT::ValueType VT = N->getValueType(0);
  MVT::ValueType EVT = MVT::getVectorElementType(VT);
  if ((EVT != MVT::i64 && EVT != MVT::f64) || Subtarget->is64Bit())
    // We are looking for load i64 and zero extend. We want to transform
    // it before legalizer has a chance to expand it. Also look for i64
    // BUILD_PAIR bit casted to f64.
    return SDOperand();
  // This must be an insertion into a zero vector.
  SDOperand HighElt = N->getOperand(1);
  if (!isZeroNode(HighElt))
    return SDOperand();

  // Value must be a load.
  SDNode *Base = N->getOperand(0).Val;
  if (!isa<LoadSDNode>(Base)) {
    if (Base->getOpcode() != ISD::BIT_CONVERT)
      return SDOperand();
    Base = Base->getOperand(0).Val;
    if (!isa<LoadSDNode>(Base))
      return SDOperand();
  }

  // Transform it into VZEXT_LOAD addr.
  LoadSDNode *LD = cast<LoadSDNode>(Base);
  
  // Load must not be an extload.
  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
    return SDOperand();
  
  return DAG.getNode(X86ISD::VZEXT_LOAD, VT, LD->getChain(), LD->getBasePtr());
}                                           

/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
static SDOperand PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget *Subtarget) {
  SDOperand Cond = N->getOperand(0);
  // If we have SSE[12] support, try to form min/max nodes.
  if (Subtarget->hasSSE2() &&
      (N->getValueType(0) == MVT::f32 || N->getValueType(0) == MVT::f64)) {
    if (Cond.getOpcode() == ISD::SETCC) {
      // Get the LHS/RHS of the select.
      SDOperand LHS = N->getOperand(1);
      SDOperand RHS = N->getOperand(2);
      unsigned Opcode = 0;
      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
      if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
        switch (CC) {
        default: break;
        case ISD::SETOLE: // (X <= Y) ? X : Y -> min
        case ISD::SETULE:
        case ISD::SETLE:
          if (!UnsafeFPMath) break;
          // FALL THROUGH.
        case ISD::SETOLT:  // (X olt/lt Y) ? X : Y -> min
        case ISD::SETLT:
          Opcode = X86ISD::FMIN;
          break;

        case ISD::SETOGT: // (X > Y) ? X : Y -> max
        case ISD::SETUGT:
        case ISD::SETGT:
          if (!UnsafeFPMath) break;
          // FALL THROUGH.
        case ISD::SETUGE:  // (X uge/ge Y) ? X : Y -> max
        case ISD::SETGE:
          Opcode = X86ISD::FMAX;
          break;
        }
      } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
        switch (CC) {
        default: break;
        case ISD::SETOGT: // (X > Y) ? Y : X -> min
        case ISD::SETUGT:
        case ISD::SETGT:
          if (!UnsafeFPMath) break;
          // FALL THROUGH.
        case ISD::SETUGE:  // (X uge/ge Y) ? Y : X -> min
        case ISD::SETGE:
          Opcode = X86ISD::FMIN;
          break;

        case ISD::SETOLE:   // (X <= Y) ? Y : X -> max
        case ISD::SETULE:
        case ISD::SETLE:
          if (!UnsafeFPMath) break;
          // FALL THROUGH.
        case ISD::SETOLT:   // (X olt/lt Y) ? Y : X -> max
        case ISD::SETLT:
          Opcode = X86ISD::FMAX;
          break;
        }
      }

      if (Opcode)
        return DAG.getNode(Opcode, N->getValueType(0), LHS, RHS);
    }
  }

  return SDOperand();
}

/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDOperand PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.
  StoreSDNode *St = cast<StoreSDNode>(N);
  if (MVT::isVector(St->getValue().getValueType()) && 
      MVT::getSizeInBits(St->getValue().getValueType()) == 64 &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().Val;
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDOperand, 8> Ops;
    SDNode* ChainVal = St->getChain().Val;
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).Val == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }
    if (Ld) {
      // If we are a 64-bit capable x86, lower to a single movq load/store pair.
      if (Subtarget->is64Bit()) {
        SDOperand NewLd = DAG.getLoad(MVT::i64, Ld->getChain(), 
                                      Ld->getBasePtr(), Ld->getSrcValue(), 
                                      Ld->getSrcValueOffset(), Ld->isVolatile(),
                                      Ld->getAlignment());
        SDOperand NewChain = NewLd.getValue(1);
        if (TokenFactorIndex != -1) {
          NewChain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Ops[0], 
                                 Ops.size());
        }
        return DAG.getStore(NewChain, NewLd, St->getBasePtr(),
                            St->getSrcValue(), St->getSrcValueOffset(),
                            St->isVolatile(), St->getAlignment());
      }

      // Otherwise, lower to two 32-bit copies.
      SDOperand LoAddr = Ld->getBasePtr();
      SDOperand HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr,
                                     DAG.getConstant(4, MVT::i32));

      SDOperand LoLd = DAG.getLoad(MVT::i32, Ld->getChain(), LoAddr,
                                   Ld->getSrcValue(), Ld->getSrcValueOffset(),
                                   Ld->isVolatile(), Ld->getAlignment());
      SDOperand HiLd = DAG.getLoad(MVT::i32, Ld->getChain(), HiAddr,
                                   Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                                   Ld->isVolatile(), 
                                   MinAlign(Ld->getAlignment(), 4));

      SDOperand NewChain = LoLd.getValue(1);
      if (TokenFactorIndex != -1) {
        Ops.push_back(LoLd);
        Ops.push_back(HiLd);
        NewChain = DAG.getNode(ISD::TokenFactor, MVT::Other, &Ops[0], 
                               Ops.size());
      }

      LoAddr = St->getBasePtr();
      HiAddr = DAG.getNode(ISD::ADD, MVT::i32, LoAddr,
                           DAG.getConstant(4, MVT::i32));

      SDOperand LoSt = DAG.getStore(NewChain, LoLd, LoAddr,
                          St->getSrcValue(), St->getSrcValueOffset(),
                          St->isVolatile(), St->getAlignment());
      SDOperand HiSt = DAG.getStore(NewChain, HiLd, HiAddr,
                                    St->getSrcValue(), St->getSrcValueOffset()+4,
                                    St->isVolatile(), 
                                    MinAlign(St->getAlignment(), 4));
      return DAG.getNode(ISD::TokenFactor, MVT::Other, LoSt, HiSt);
    }
  }
  return SDOperand();
}

/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
/// X86ISD::FXOR nodes.
static SDOperand PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
  // F[X]OR(0.0, x) -> x
  // F[X]OR(x, 0.0) -> x
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  return SDOperand();
}

/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDOperand PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
  // FAND(0.0, x) -> 0.0
  // FAND(x, 0.0) -> 0.0
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(0);
  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
    if (C->getValueAPF().isPosZero())
      return N->getOperand(1);
  return SDOperand();
}

SDOperand X86TargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  case ISD::BUILD_VECTOR:
    return PerformBuildVectorCombine(N, DAG, Subtarget, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  }

  return SDOperand();
}

//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'A':
      return C_RegisterClass;
    default:
      break;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

/// LowerXConstraint - try to replace an X constraint, which matches anything,
/// with another that has more specific requirements based on the type of the
/// corresponding operand.
const char *X86TargetLowering::
LowerXConstraint(MVT::ValueType ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (MVT::isFloatingPoint(ConstraintVT)) {
    if (Subtarget->hasSSE2())
      return "Y";
    if (Subtarget->hasSSE1())
      return "x";
  }
  
  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDOperand Op,
                                                     char Constraint,
                                                     std::vector<SDOperand>&Ops,
                                                     SelectionDAG &DAG) const {
  SDOperand Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getValue() <= 31) {
        Result = DAG.getTargetConstant(C->getValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getValue() <= 255) {
        Result = DAG.getTargetConstant(C->getValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      Result = DAG.getTargetConstant(CST->getValue(), Op.getValueType());
      break;
    }
    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
    int64_t Offset = 0;
    
    // Match either (GA) or (GA+C)
    if (GA) {
      Offset = GA->getOffset();
    } else if (Op.getOpcode() == ISD::ADD) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
      GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
      if (C && GA) {
        Offset = GA->getOffset()+C->getValue();
      } else {
        C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
        GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
        if (C && GA)
          Offset = GA->getOffset()+C->getValue();
        else
          C = 0, GA = 0;
      }
    }
    
    if (GA) {
      // If addressing this global requires a load (e.g. in PIC mode), we can't
      // match.
      if (Subtarget->GVRequiresExtraLoad(GA->getGlobal(), getTargetMachine(),
                                         false))
        return;

      Op = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
                                      Offset);
      Result = Op;
      break;
    }

    // Otherwise, not valid for this mode.
    return;
  }
  }

  if (Result.Val) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

std::vector<unsigned> X86TargetLowering::
getRegClassForInlineAsmConstraint(const std::string &Constraint,
                                  MVT::ValueType VT) const {
  if (Constraint.size() == 1) {
    // FIXME: not handling fp-stack yet!
    switch (Constraint[0]) {      // GCC X86 Constraint Letters
    default: break;  // Unknown constraint letter
    case 'A':   // EAX/EDX
      if (VT == MVT::i32 || VT == MVT::i64)
        return make_vector<unsigned>(X86::EAX, X86::EDX, 0);
      break;
    case 'q':   // Q_REGS (GENERAL_REGS in 64-bit mode)
    case 'Q':   // Q_REGS
      if (VT == MVT::i32)
        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
      else if (VT == MVT::i16)
        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
      else if (VT == MVT::i8)
        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
      else if (VT == MVT::i64)
        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
      break;
    }
  }

  return std::vector<unsigned>();
}

std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT::ValueType VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'R':   // LEGACY_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i64 && Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR64RegisterClass);
      if (VT == MVT::i32)
        return std::make_pair(0U, X86::GR32RegisterClass);
      else if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      else if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      break;
    case 'f':  // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
      break;
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;
      
      switch (VT) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // GCC calls "st(0)" just plain "st".
    if (StringsEqualNoCase("{st}", Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
    }
    return Res;
  }
  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.
  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second != X86::GR16RegisterClass)
    return Res;
  if (VT == MVT::i8) {
    unsigned DestReg = 0;
    switch (Res.first) {
    default: break;
    case X86::AX: DestReg = X86::AL; break;
    case X86::DX: DestReg = X86::DL; break;
    case X86::CX: DestReg = X86::CL; break;
    case X86::BX: DestReg = X86::BL; break;
    }
    if (DestReg) {
      Res.first = DestReg;
      Res.second = X86::GR8RegisterClass;
    }
  } else if (VT == MVT::i32) {
    unsigned DestReg = 0;
    switch (Res.first) {
    default: break;
    case X86::AX: DestReg = X86::EAX; break;
    case X86::DX: DestReg = X86::EDX; break;
    case X86::CX: DestReg = X86::ECX; break;
    case X86::BX: DestReg = X86::EBX; break;
    case X86::SI: DestReg = X86::ESI; break;
    case X86::DI: DestReg = X86::EDI; break;
    case X86::BP: DestReg = X86::EBP; break;
    case X86::SP: DestReg = X86::ESP; break;
    }
    if (DestReg) {
      Res.first = DestReg;
      Res.second = X86::GR32RegisterClass;
    }
  } else if (VT == MVT::i64) {
    unsigned DestReg = 0;
    switch (Res.first) {
    default: break;
    case X86::AX: DestReg = X86::RAX; break;
    case X86::DX: DestReg = X86::RDX; break;
    case X86::CX: DestReg = X86::RCX; break;
    case X86::BX: DestReg = X86::RBX; break;
    case X86::SI: DestReg = X86::RSI; break;
    case X86::DI: DestReg = X86::RDI; break;
    case X86::BP: DestReg = X86::RBP; break;
    case X86::SP: DestReg = X86::RSP; break;
    }
    if (DestReg) {
      Res.first = DestReg;
      Res.second = X86::GR64RegisterClass;
    }