X86ISelLowering.cpp

          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
          if (StackPtr.getNode() == 0)
            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
                                          getPointerTy());
          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                           ArgChain,
                                                           Flags, DAG, dl));
        } else {
          // Store relative to framepointer.
          MemOpChains2.push_back(
            DAG.getStore(ArgChain, dl, Arg, FIN,
                         PseudoSourceValue::getFixedStack(FI), 0));
        }
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                          &MemOpChains2[0], MemOpChains2.size());

    // Copy arguments to their registers.
    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                               RegsToPass[i].second, InFlag);
      InFlag = Chain.getValue(1);
    }
    InFlag =SDValue();

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
                                     FPDiff, dl);
  }

  bool WasGlobalOrExternal = false;
  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    WasGlobalOrExternal = true;
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
    // it.

    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportLinkage()) {
      unsigned char OpFlags = 0;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols most go through the PLT in PIC mode.  If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
               (GV->isDeclaration() || GV->isWeakForLinker()) &&
               Subtarget->getDarwinVers() < 9) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      }

      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
                                          G->getOffset(), OpFlags);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    WasGlobalOrExternal = true;
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
    // symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
             Subtarget->getDarwinVers() < 9) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  }

  if (isTailCall && !WasGlobalOrExternal) {
    // Force the address into a (call preserved) caller-saved register since
    // tailcall must happen after callee-saved registers are poped.
    // FIXME: Give it a special register class that contains caller-saved
    // register instead?
    unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX;
    Chain = DAG.getCopyToReg(Chain,  dl,
                             DAG.getRegister(TCReg, getPointerTy()),
                             Callee,InFlag);
    Callee = DAG.getRegister(TCReg, getPointerTy());
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
  SmallVector<SDValue, 8> Ops;

  if (isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                           DAG.getIntPtrConstant(0, true), InFlag);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add an implicit use GOT pointer in EBX.
  if (!isTailCall && Subtarget->isPICStyleGOT())
    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));

  // Add an implicit use of AL for x86 vararg functions.
  if (Is64Bit && isVarArg)
    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // If this is the first return lowered for this function, add the regs
    // to the liveout set for the function.
    if (MF.getRegInfo().liveout_empty()) {
      SmallVector<CCValAssign, 16> RVLocs;
      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
                     *DAG.getContext());
      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
      for (unsigned i = 0; i != RVLocs.size(); ++i)
        if (RVLocs[i].isRegLoc())
          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
    }

    assert(((Callee.getOpcode() == ISD::Register &&
               (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
                cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
              Callee.getOpcode() == ISD::TargetExternalSymbol ||
              Callee.getOpcode() == ISD::TargetGlobalAddress) &&
           "Expecting a global address, external symbol, or scratch register");

    return DAG.getNode(X86ISD::TC_RETURN, dl,
                       NodeTys, &Ops[0], Ops.size());
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPush;
  if (IsCalleePop(isVarArg, CallConv))
    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
  else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
    // If this is is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    NumBytesForCalleeToPush = 4;
  else
    NumBytesForCalleeToPush = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(NumBytesForCalleeToPush,
                                                   true),
                             InFlag);
  InFlag = Chain.getValue(1);

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}


//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like std call, callee cleans arguments, convention except that ECX is
//  reserved for storing the tail called function address. Only 2 registers are
//  free for argument passing (inreg). Tail call optimization is performed
//  provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to platform abi the function
//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
//  If a tail called function callee has more arguments than the caller the
//  caller needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after the
//  original REtADDR, but before the saved framepointer or the spilled registers
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
/// for a 16 byte align requirement.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
                                                        SelectionDAG& DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  uint64_t AlignMask = StackAlignment - 1;
  int64_t Offset = StackSize;
  uint64_t SlotSize = TD->getPointerSize();
  if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
      (StackAlignment-SlotSize);
  }
  return Offset;
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  if (CalleeCC != CallingConv::Fast &&
      CalleeCC != CallingConv::C)
    return false;

  // If -tailcallopt is specified, make fastcc functions tail-callable.
  const Function *CallerF = DAG.getMachineFunction().getFunction();
  if (PerformTailCallOpt) {
    if (CalleeCC == CallingConv::Fast &&
        CallerF->getCallingConv() == CalleeCC)
      return true;
    return false;
  }

  // Look for obvious safe cases to perform tail call optimization that does not
  // requite ABI changes. This is what gcc calls sibcall.

  // Do not tail call optimize vararg calls for now.
  if (isVarArg)
    return false;

  // If the callee takes no arguments then go on to check the results of the
  // call.
  if (!Outs.empty()) {
    // Check if stack adjustment is needed. For now, do not do this if any
    // argument is passed on the stack.
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
                   ArgLocs, *DAG.getContext());
    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
    if (CCInfo.getNextStackOffset()) {
      MachineFunction &MF = DAG.getMachineFunction();
      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
        return false;
      if (Subtarget->isTargetWin64())
        // Win64 ABI has additional complications.
        return false;

      // Check if the arguments are already laid out in the right way as
      // the caller's fixed stack objects.
      MachineFrameInfo *MFI = MF.getFrameInfo();
      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        EVT RegVT = VA.getLocVT();
        SDValue Arg = Outs[i].Val;
        ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (Flags.isByVal())
          return false; // TODO
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg);
          if (!Ld)
            return false;
          SDValue Ptr = Ld->getBasePtr();
          FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
          if (!FINode)
            return false;
          int FI = FINode->getIndex();
          if (!MFI->isFixedObjectIndex(FI))
            return false;
          if (VA.getLocMemOffset() != MFI->getObjectOffset(FI))
            return false;
        }
      }
    }
  }

  return true;
}

FastISel *
X86TargetLowering::createFastISel(MachineFunction &mf, MachineModuleInfo *mmo,
                            DwarfWriter *dw,
                            DenseMap<const Value *, unsigned> &vm,
                            DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
                            DenseMap<const AllocaInst *, int> &am
#ifndef NDEBUG
                          , SmallSet<Instruction*, 8> &cil
#endif
                                  ) {
  return X86::createFastISel(mf, mmo, dw, vm, bm, am
#ifndef NDEBUG
                             , cil
#endif
                             );
}


//===----------------------------------------------------------------------===//
//                           Other Lowering Hooks
//===----------------------------------------------------------------------===//


SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  int ReturnAddrIndex = FuncInfo->getRAIndex();

  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    uint64_t SlotSize = TD->getPointerSize();
    ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
                                                           true, false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                       bool hasSymbolicDisplacement) {
  // Offset should fit into 32 bit immediate field.
  if (!isInt32(Offset))
    return false;

  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;

  // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;

  // For small code model we assume that latest object is 16MB before end of 31
  // bits boundary. We may also accept pretty large negative constants knowing
  // that all objects are in the positive half of address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;

  // For kernel code model we know that all object resist in the negative half
  // of 32bits address space. We may not accept negative offsets, since they may
  // be just off and we may accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset > 0)
    return true;

  return false;
}

/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code, returning the condition code and the LHS/RHS of the
/// comparison to make.
static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_NS;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        return X86::COND_S;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
        // X < 1   -> X <= 0
        RHS = DAG.getConstant(0, RHS.getValueType());
        return X86::COND_LE;
      }
    }

    switch (SetCCOpcode) {
    default: llvm_unreachable("Invalid integer condition!");
    case ISD::SETEQ:  return X86::COND_E;
    case ISD::SETGT:  return X86::COND_G;
    case ISD::SETGE:  return X86::COND_GE;
    case ISD::SETLT:  return X86::COND_L;
    case ISD::SETLE:  return X86::COND_LE;
    case ISD::SETNE:  return X86::COND_NE;
    case ISD::SETULT: return X86::COND_B;
    case ISD::SETUGT: return X86::COND_A;
    case ISD::SETULE: return X86::COND_BE;
    case ISD::SETUGE: return X86::COND_AE;
    }
  }

  // First determine if it is required or is profitable to flip the operands.

  // If LHS is a foldable load, but RHS is not, flip the condition.
  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
    std::swap(LHS, RHS);
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETOLT:
  case ISD::SETOLE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    std::swap(LHS, RHS);
    break;
  }

  // On a floating point condition, the flags are set as follows:
  // ZF  PF  CF   op
  //  0 | 0 | 0 | X > Y
  //  0 | 0 | 1 | X < Y
  //  1 | 0 | 0 | X == Y
  //  1 | 1 | 1 | unordered
  switch (SetCCOpcode) {
  default: llvm_unreachable("Condcode should be pre-legalized away");
  case ISD::SETUEQ:
  case ISD::SETEQ:   return X86::COND_E;
  case ISD::SETOLT:              // flipped
  case ISD::SETOGT:
  case ISD::SETGT:   return X86::COND_A;
  case ISD::SETOLE:              // flipped
  case ISD::SETOGE:
  case ISD::SETGE:   return X86::COND_AE;
  case ISD::SETUGT:              // flipped
  case ISD::SETULT:
  case ISD::SETLT:   return X86::COND_B;
  case ISD::SETUGE:              // flipped
  case ISD::SETULE:
  case ISD::SETLE:   return X86::COND_BE;
  case ISD::SETONE:
  case ISD::SETNE:   return X86::COND_NE;
  case ISD::SETUO:   return X86::COND_P;
  case ISD::SETO:    return X86::COND_NP;
  case ISD::SETOEQ:
  case ISD::SETUNE:  return X86::COND_INVALID;
  }
}

/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code. Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// isFPImmLegal - Returns true if the target can instruction select the
/// specified FP immediate natively. If false, the legalizer will
/// materialize the FP immediate as a load from a constant pool.
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
      return true;
  }
  return false;
}

/// isUndefOrInRange - Return true if Val is undef or if its value falls within
/// the specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  if (Val < 0 || Val == CmpVal)
    return true;
  return false;
}

/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
/// the second operand.
static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
  if (VT == MVT::v2f64 || VT == MVT::v2i64)
    return (Mask[0] < 2 && Mask[1] < 2);
  return false;
}

bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFDMask(M, N->getValueType(0));
}

/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFHW.
static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Lower quadword copied in order or undef.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Upper quadword shuffled.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
      return false;

  return true;
}

bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFHWMask(M, N->getValueType(0));
}

/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PSHUFLW.
static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT != MVT::v8i16)
    return false;

  // Upper quadword copied in order.
  for (int i = 4; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;

  // Lower quadword shuffled.
  for (int i = 0; i != 4; ++i)
    if (Mask[i] >= 4)
      return false;

  return true;
}

bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPSHUFLWMask(M, N->getValueType(0));
}

/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR.
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool hasSSSE3) {
  int i, e = VT.getVectorNumElements();
  
  // Do not handle v2i64 / v2f64 shuffles with palignr.
  if (e < 4 || !hasSSSE3)
    return false;
  
  for (i = 0; i != e; ++i)
    if (Mask[i] >= 0)
      break;
  
  // All undef, not a palignr.
  if (i == e)
    return false;

  // Determine if it's ok to perform a palignr with only the LHS, since we
  // don't have access to the actual shuffle elements to see if RHS is undef.
  bool Unary = Mask[i] < (int)e;
  bool NeedsUnary = false;

  int s = Mask[i] - i;
  
  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != e; ++i) {
    int m = Mask[i];
    if (m < 0) 
      continue;
    
    Unary = Unary && (m < (int)e);
    NeedsUnary = NeedsUnary || (m < s);

    if (NeedsUnary && !Unary)
      return false;
    if (Unary && m != ((s+i) & (e-1)))
      return false;
    if (!Unary && m != (s+i))
      return false;
  }
  return true;
}

bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isPALIGNRMask(M, N->getValueType(0), true);
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to SHUFP*.
static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4)
    return false;

  int Half = NumElems / 2;
  for (int i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Mask[i], 0, NumElems))
      return false;
  for (int i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
      return false;

  return true;
}

bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isSHUFPMask(M, N->getValueType(0));
}

/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
/// half elements to come from vector 1 (which would equal the dest.) and
/// the upper half to come from vector 2.
static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  int Half = NumElems / 2;
  for (int i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
      return false;
  for (int i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Mask[i], 0, NumElems))
      return false;
  return true;
}

static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedSHUFPMask(M, N->getValueType(0));
}

/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
  return isUndefOrEqual(N->getMaskElt(0), 6) &&
         isUndefOrEqual(N->getMaskElt(1), 7) &&
         isUndefOrEqual(N->getMaskElt(2), 2) &&
         isUndefOrEqual(N->getMaskElt(3), 3);
}

/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
/// <2, 3, 2, 3>
bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  
  if (NumElems != 4)
    return false;
  
  return isUndefOrEqual(N->getMaskElt(0), 2) &&
  isUndefOrEqual(N->getMaskElt(1), 3) &&
  isUndefOrEqual(N->getMaskElt(2), 2) &&
  isUndefOrEqual(N->getMaskElt(3), 3);
}

/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
      return false;

  for (unsigned i = NumElems/2; i < NumElems; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  return true;
}

/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
  unsigned NumElems = N->getValueType(0).getVectorNumElements();

  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
      return false;

  return true;
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (!isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
                         bool V2IsSplat = false) {
  int NumElts = VT.getVectorNumElements();
  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
    return false;

  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j + NumElts/2))
      return false;
    if (V2IsSplat) {
      if (isUndefOrEqual(BitI1, NumElts))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
        return false;
    }
  }
  return true;
}

bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
}

/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
/// <2, 2, 3, 3>
static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
  int NumElems = VT.getVectorNumElements();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
    int BitI  = Mask[i];
    int BitI1 = Mask[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }
  return true;
}

bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
  if (VT.getVectorElementType().getSizeInBits() < 32)
    return false;

  int NumElts = VT.getVectorNumElements();

  if (!isUndefOrEqual(Mask[0], NumElts))
    return false;

  for (int i = 1; i < NumElts; ++i)
    if (!isUndefOrEqual(Mask[i], i))
      return false;

  return true;
}

bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return ::isMOVLMask(M, N->getValueType(0));
}

/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
                               bool V2IsSplat = false, bool V2IsUndef = false) {
  int NumOps = VT.getVectorNumElements();
  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
    return false;

  if (!isUndefOrEqual(Mask[0], 0))
    return false;

  for (int i = 1; i < NumOps; ++i)
    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
      return false;

  return true;
}

static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  SmallVector<int, 8> M;
  N->getMask(M);
  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
}

/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 1, 1, 3, 3
  for (unsigned i = 0; i < 2; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 1)
      return false;
  }

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 3)
      return false;
    if (Elt == 3)
      HasHi = true;
  }
  // Don't use movshdup if it can be done with a shufps.
  // FIXME: verify that matching u, u, 3, 3 is what we want.
  return HasHi;
}

/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
  if (N->getValueType(0).getVectorNumElements() != 4)
    return false;

  // Expect 0, 0, 2, 2
  for (unsigned i = 0; i < 2; ++i)
    if (N->getMaskElt(i) > 0)
      return false;

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    int Elt = N->getMaskElt(i);
    if (Elt >= 0 && Elt != 2)
      return false;
    if (Elt == 2)
      HasHi = true;
  }
  // Don't use movsldup if it can be done with a shufps.
  return HasHi;
}

/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
  int e = N->getValueType(0).getVectorNumElements() / 2;

  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(i), i))
      return false;
  for (int i = 0; i < e; ++i)
    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
      return false;
  return true;
}

/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  int NumOperands = SVOp->getValueType(0).getVectorNumElements();

  unsigned Shift = (NumOperands == 4) ? 2 : 1;
  unsigned Mask = 0;
  for (int i = 0; i < NumOperands; ++i) {
    int Val = SVOp->getMaskElt(NumOperands-i-1);
    if (Val < 0) Val = 0;
    if (Val >= NumOperands) Val -= NumOperands;
    Mask |= Val;
    if (i != NumOperands - 1)