X86ISelLowering.cpp

  if (isVarArg) {
    NumBytesForCalleeToPush = 0;
  } else {
    NumBytesForCalleeToPush = NumBytes;
  }

  NodeTys.clear();
  NodeTys.push_back(MVT::Other);   // Returns a chain
  if (RetVT != MVT::Other)
    NodeTys.push_back(MVT::Flag);  // Returns a flag for retval copy to use.
  Ops.clear();
  Ops.push_back(Chain);
  Ops.push_back(DAG.getConstant(NumBytes, getPointerTy()));
  Ops.push_back(DAG.getConstant(NumBytesForCalleeToPush, getPointerTy()));
  Ops.push_back(InFlag);
  Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
  if (RetVT != MVT::Other)
    InFlag = Chain.getValue(1);
  
  std::vector<SDOperand> ResultVals;
  NodeTys.clear();
  switch (RetVT) {
  default: assert(0 && "Unknown value type to return!");
  case MVT::Other: break;
  case MVT::i8:
    Chain = DAG.getCopyFromReg(Chain, X86::AL, MVT::i8, InFlag).getValue(1);
    ResultVals.push_back(Chain.getValue(0));
    NodeTys.push_back(MVT::i8);
    break;
  case MVT::i16:
    Chain = DAG.getCopyFromReg(Chain, X86::AX, MVT::i16, InFlag).getValue(1);
    ResultVals.push_back(Chain.getValue(0));
    NodeTys.push_back(MVT::i16);
    break;
  case MVT::i32:
    if (Op.Val->getValueType(1) == MVT::i32) {
      Chain = DAG.getCopyFromReg(Chain, X86::EAX, MVT::i32, InFlag).getValue(1);
      ResultVals.push_back(Chain.getValue(0));
      Chain = DAG.getCopyFromReg(Chain, X86::EDX, MVT::i32,
                                 Chain.getValue(2)).getValue(1);
      ResultVals.push_back(Chain.getValue(0));
      NodeTys.push_back(MVT::i32);
    } else {
      Chain = DAG.getCopyFromReg(Chain, X86::EAX, MVT::i32, InFlag).getValue(1);
      ResultVals.push_back(Chain.getValue(0));
    }
    NodeTys.push_back(MVT::i32);
    break;
  case MVT::f32:
  case MVT::f64: {
    std::vector<MVT::ValueType> Tys;
    Tys.push_back(MVT::f64);
    Tys.push_back(MVT::Other);
    Tys.push_back(MVT::Flag);
    std::vector<SDOperand> Ops;
    Ops.push_back(Chain);
    Ops.push_back(InFlag);
    SDOperand RetVal = DAG.getNode(X86ISD::FP_GET_RESULT, Tys, 
                                   &Ops[0], Ops.size());
    Chain  = RetVal.getValue(1);
    InFlag = RetVal.getValue(2);
    if (X86ScalarSSE) {
      // FIXME: Currently the FST is flagged to the FP_GET_RESULT. This
      // shouldn't be necessary except that RFP cannot be live across
      // multiple blocks. When stackifier is fixed, they can be uncoupled.
      MachineFunction &MF = DAG.getMachineFunction();
      int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8);
      SDOperand StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
      Tys.clear();
      Tys.push_back(MVT::Other);
      Ops.clear();
      Ops.push_back(Chain);
      Ops.push_back(RetVal);
      Ops.push_back(StackSlot);
      Ops.push_back(DAG.getValueType(RetVT));
      Ops.push_back(InFlag);
      Chain = DAG.getNode(X86ISD::FST, Tys, &Ops[0], Ops.size());
      RetVal = DAG.getLoad(RetVT, Chain, StackSlot, NULL, 0);
      Chain = RetVal.getValue(1);
    }

    if (RetVT == MVT::f32 && !X86ScalarSSE)
      // FIXME: we would really like to remember that this FP_ROUND
      // operation is okay to eliminate if we allow excess FP precision.
      RetVal = DAG.getNode(ISD::FP_ROUND, MVT::f32, RetVal);
    ResultVals.push_back(RetVal);
    NodeTys.push_back(RetVT);
    break;
  }
  }

  // If the function returns void, just return the chain.
  if (ResultVals.empty())
    return Chain;
  
  // Otherwise, merge everything together with a MERGE_VALUES node.
  NodeTys.push_back(MVT::Other);
  ResultVals.push_back(Chain);
  SDOperand Res = DAG.getNode(ISD::MERGE_VALUES, NodeTys,
                              &ResultVals[0], ResultVals.size());
  return Res.getValue(Op.ResNo);
}

//===----------------------------------------------------------------------===//
//                  FastCall Calling Convention implementation
//===----------------------------------------------------------------------===//
//
// The X86 'fastcall' calling convention passes up to two integer arguments in
// registers (an appropriate portion of ECX/EDX), passes arguments in C order,
// and requires that the callee pop its arguments off the stack (allowing proper
// tail calls), and has the same return value conventions as C calling convs.
//
// This calling convention always arranges for the callee pop value to be 8n+4
// bytes, which is needed for tail recursion elimination and stack alignment
// reasons.
//

/// HowToPassFastCallCCArgument - Returns how an formal argument of the
/// specified type should be passed. If it is through stack, returns the size of
/// the stack slot; if it is through integer register, returns the number of
/// integer registers are needed.
static void
HowToPassFastCallCCArgument(MVT::ValueType ObjectVT,
                            unsigned NumIntRegs,
                            unsigned &ObjSize,
                            unsigned &ObjIntRegs)
{
  ObjSize = 0;
  ObjIntRegs = 0;

  switch (ObjectVT) {
  default: assert(0 && "Unhandled argument type!");
  case MVT::i8:
   if (NumIntRegs < 2)
     ObjIntRegs = 1;
   else
     ObjSize = 1;
   break;
  case MVT::i16:
   if (NumIntRegs < 2)
     ObjIntRegs = 1;
   else
     ObjSize = 2;
   break;
  case MVT::i32:
   if (NumIntRegs < 2)
     ObjIntRegs = 1;
   else
     ObjSize = 4;
    break;
  case MVT::i64:
   if (NumIntRegs+2 <= 2) {
     ObjIntRegs = 2;
   } else if (NumIntRegs+1 <= 2) {
     ObjIntRegs = 1;
     ObjSize = 4;
   } else
     ObjSize = 8;
   case MVT::f32:
    ObjSize = 4;
    break;
   case MVT::f64:
    ObjSize = 8;
    break;
  }
}

SDOperand
X86TargetLowering::LowerFastCallCCArguments(SDOperand Op, SelectionDAG &DAG) {
  unsigned NumArgs = Op.Val->getNumValues()-1;
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  SDOperand Root = Op.getOperand(0);
  std::vector<SDOperand> ArgValues;

  // Add DAG nodes to load the arguments...  On entry to a function the stack
  // frame looks like this:
  //
  // [ESP] -- return address
  // [ESP + 4] -- first nonreg argument (leftmost lexically)
  // [ESP + 8] -- second nonreg argument, if 1st argument is <= 4 bytes in size
  //    ...
  unsigned ArgOffset = 0;   // Frame mechanisms handle retaddr slot

  // Keep track of the number of integer regs passed so far.  This can be either
  // 0 (neither ECX or EDX used), 1 (ECX is used) or 2 (ECX and EDX are both
  // used).
  unsigned NumIntRegs = 0;

  for (unsigned i = 0; i < NumArgs; ++i) {
    MVT::ValueType ObjectVT = Op.getValue(i).getValueType();
    unsigned ArgIncrement = 4;
    unsigned ObjSize = 0;
    unsigned ObjIntRegs = 0;

    HowToPassFastCallCCArgument(ObjectVT, NumIntRegs, ObjSize, ObjIntRegs);
    if (ObjSize > 4)
      ArgIncrement = ObjSize;

    unsigned Reg = 0;
    SDOperand ArgValue;
    if (ObjIntRegs) {
      switch (ObjectVT) {
      default: assert(0 && "Unhandled argument type!");
      case MVT::i8:
        Reg = AddLiveIn(MF, NumIntRegs ? X86::DL : X86::CL,
                        X86::GR8RegisterClass);
        ArgValue = DAG.getCopyFromReg(Root, Reg, MVT::i8);
        break;
      case MVT::i16:
        Reg = AddLiveIn(MF, NumIntRegs ? X86::DX : X86::CX,
                        X86::GR16RegisterClass);
        ArgValue = DAG.getCopyFromReg(Root, Reg, MVT::i16);
        break;
      case MVT::i32:
        Reg = AddLiveIn(MF, NumIntRegs ? X86::EDX : X86::ECX,
                        X86::GR32RegisterClass);
        ArgValue = DAG.getCopyFromReg(Root, Reg, MVT::i32);
        break;
      case MVT::i64:
        Reg = AddLiveIn(MF, NumIntRegs ? X86::EDX : X86::ECX,
                        X86::GR32RegisterClass);
        ArgValue = DAG.getCopyFromReg(Root, Reg, MVT::i32);
        if (ObjIntRegs == 2) {
          Reg = AddLiveIn(MF, X86::EDX, X86::GR32RegisterClass);
          SDOperand ArgValue2 = DAG.getCopyFromReg(Root, Reg, MVT::i32);
          ArgValue= DAG.getNode(ISD::BUILD_PAIR, MVT::i64, ArgValue, ArgValue2);
        }
        break;
      }
      
      NumIntRegs += ObjIntRegs;
    }

    if (ObjSize) {
      // Create the SelectionDAG nodes corresponding to a load from this
      // parameter.
      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset);
      SDOperand FIN = DAG.getFrameIndex(FI, getPointerTy());
      if (ObjectVT == MVT::i64 && ObjIntRegs) {
        SDOperand ArgValue2 = DAG.getLoad(Op.Val->getValueType(i), Root, FIN,
                                          NULL, 0);
        ArgValue = DAG.getNode(ISD::BUILD_PAIR, MVT::i64, ArgValue, ArgValue2);
      } else
        ArgValue = DAG.getLoad(Op.Val->getValueType(i), Root, FIN, NULL, 0);
      ArgOffset += ArgIncrement;   // Move on to the next argument.
    }

    ArgValues.push_back(ArgValue);
  }

  ArgValues.push_back(Root);

  // Make sure the instruction takes 8n+4 bytes to make sure the start of the
  // arguments and the arguments after the retaddr has been pushed are aligned.
  if ((ArgOffset & 7) == 0)
    ArgOffset += 4;

  VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
  RegSaveFrameIndex = 0xAAAAAAA;   // X86-64 only.
  ReturnAddrIndex = 0;             // No return address slot generated yet.
  BytesToPopOnReturn = ArgOffset;  // Callee pops all stack arguments.
  BytesCallerReserves = 0;

  MF.getInfo<X86FunctionInfo>()->setBytesToPopOnReturn(BytesToPopOnReturn);

  // Finally, inform the code generator which regs we return values in.
  switch (getValueType(MF.getFunction()->getReturnType())) {
  default: assert(0 && "Unknown type!");
  case MVT::isVoid: break;
  case MVT::i1:
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    MF.addLiveOut(X86::ECX);
    break;
  case MVT::i64:
    MF.addLiveOut(X86::ECX);
    MF.addLiveOut(X86::EDX);
    break;
  case MVT::f32:
  case MVT::f64:
    MF.addLiveOut(X86::ST0);
    break;
  }

  // Return the new list of results.
  std::vector<MVT::ValueType> RetVTs(Op.Val->value_begin(),
                                     Op.Val->value_end());
  return DAG.getNode(ISD::MERGE_VALUES, RetVTs, &ArgValues[0],ArgValues.size());
}

SDOperand X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
  if (ReturnAddrIndex == 0) {
    // Set up a frame object for the return address.
    MachineFunction &MF = DAG.getMachineFunction();
    if (Subtarget->is64Bit())
      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(8, -8);
    else
      ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(4, -4);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}


std::pair<SDOperand, SDOperand> X86TargetLowering::
LowerFrameReturnAddress(bool isFrameAddress, SDOperand Chain, unsigned Depth,
                        SelectionDAG &DAG) {
  SDOperand Result;
  if (Depth)        // Depths > 0 not supported yet!
    Result = DAG.getConstant(0, getPointerTy());
  else {
    SDOperand RetAddrFI = getReturnAddressFrameIndex(DAG);
    if (!isFrameAddress)
      // Just load the return address
      Result = DAG.getLoad(getPointerTy(), DAG.getEntryNode(), RetAddrFI,
                           NULL, 0);
    else
      Result = DAG.getNode(ISD::SUB, getPointerTy(), RetAddrFI,
                           DAG.getConstant(4, getPointerTy()));
  }
  return std::make_pair(Result, Chain);
}

/// translateX86CC - do a one to one translation of a ISD::CondCode to the X86
/// specific condition code. It returns a false if it cannot do a direct
/// translation. X86CC is the translated CondCode.  LHS/RHS are modified as
/// needed.
static bool translateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
                           unsigned &X86CC, SDOperand &LHS, SDOperand &RHS,
                           SelectionDAG &DAG) {
  X86CC = X86::COND_INVALID;
  if (!isFP) {
    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
        // X > -1   -> X == 0, jump !sign.
        RHS = DAG.getConstant(0, RHS.getValueType());
        X86CC = X86::COND_NS;
        return true;
      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
        // X < 0   -> X == 0, jump on sign.
        X86CC = X86::COND_S;
        return true;
      }
    }
    
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETEQ:  X86CC = X86::COND_E;  break;
    case ISD::SETGT:  X86CC = X86::COND_G;  break;
    case ISD::SETGE:  X86CC = X86::COND_GE; break;
    case ISD::SETLT:  X86CC = X86::COND_L;  break;
    case ISD::SETLE:  X86CC = X86::COND_LE; break;
    case ISD::SETNE:  X86CC = X86::COND_NE; break;
    case ISD::SETULT: X86CC = X86::COND_B;  break;
    case ISD::SETUGT: X86CC = X86::COND_A;  break;
    case ISD::SETULE: X86CC = X86::COND_BE; break;
    case ISD::SETUGE: X86CC = X86::COND_AE; break;
    }
  } else {
    // On a floating point condition, the flags are set as follows:
    // ZF  PF  CF   op
    //  0 | 0 | 0 | X > Y
    //  0 | 0 | 1 | X < Y
    //  1 | 0 | 0 | X == Y
    //  1 | 1 | 1 | unordered
    bool Flip = false;
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETUEQ:
    case ISD::SETEQ: X86CC = X86::COND_E;  break;
    case ISD::SETOLT: Flip = true; // Fallthrough
    case ISD::SETOGT:
    case ISD::SETGT: X86CC = X86::COND_A;  break;
    case ISD::SETOLE: Flip = true; // Fallthrough
    case ISD::SETOGE:
    case ISD::SETGE: X86CC = X86::COND_AE; break;
    case ISD::SETUGT: Flip = true; // Fallthrough
    case ISD::SETULT:
    case ISD::SETLT: X86CC = X86::COND_B;  break;
    case ISD::SETUGE: Flip = true; // Fallthrough
    case ISD::SETULE:
    case ISD::SETLE: X86CC = X86::COND_BE; break;
    case ISD::SETONE:
    case ISD::SETNE: X86CC = X86::COND_NE; break;
    case ISD::SETUO: X86CC = X86::COND_P;  break;
    case ISD::SETO:  X86CC = X86::COND_NP; break;
    }
    if (Flip)
      std::swap(LHS, RHS);
  }

  return X86CC != X86::COND_INVALID;
}

/// hasFPCMov - is there a floating point cmov for the specific X86 condition
/// code. Current x86 isa includes the following FP cmov instructions:
/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
static bool hasFPCMov(unsigned X86CC) {
  switch (X86CC) {
  default:
    return false;
  case X86::COND_B:
  case X86::COND_BE:
  case X86::COND_E:
  case X86::COND_P:
  case X86::COND_A:
  case X86::COND_AE:
  case X86::COND_NE:
  case X86::COND_NP:
    return true;
  }
}

/// DarwinGVRequiresExtraLoad - true if accessing the GV requires an extra
/// load. For Darwin, external and weak symbols are indirect, loading the value
/// at address GV rather then the value of GV itself. This means that the
/// GlobalAddress must be in the base or index register of the address, not the
/// GV offset field.
static bool DarwinGVRequiresExtraLoad(GlobalValue *GV) {
  return (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
          (GV->isExternal() && !GV->hasNotBeenReadFromBytecode()));
}

/// WindowsGVRequiresExtraLoad - true if accessing the GV requires an extra
/// load. For Windows, dllimported symbols are indirect, loading the value at
/// address GV rather then the value of GV itself. This means that the
/// GlobalAddress must be in the base or index register of the address, not the
/// GV offset field.
static bool WindowsGVRequiresExtraLoad(GlobalValue *GV) {
  return (GV->hasDLLImportLinkage());  
}

/// isUndefOrInRange - Op is either an undef node or a ConstantSDNode.  Return
/// true if Op is undef or if its value falls within the specified range (L, H].
static bool isUndefOrInRange(SDOperand Op, unsigned Low, unsigned Hi) {
  if (Op.getOpcode() == ISD::UNDEF)
    return true;

  unsigned Val = cast<ConstantSDNode>(Op)->getValue();
  return (Val >= Low && Val < Hi);
}

/// isUndefOrEqual - Op is either an undef node or a ConstantSDNode.  Return
/// true if Op is undef or if its value equal to the specified value.
static bool isUndefOrEqual(SDOperand Op, unsigned Val) {
  if (Op.getOpcode() == ISD::UNDEF)
    return true;
  return cast<ConstantSDNode>(Op)->getValue() == Val;
}

/// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFD.
bool X86::isPSHUFDMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 4)
    return false;

  // Check if the value doesn't reference the second vector.
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    if (cast<ConstantSDNode>(Arg)->getValue() >= 4)
      return false;
  }

  return true;
}

/// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFHW.
bool X86::isPSHUFHWMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 8)
    return false;

  // Lower quadword copied in order.
  for (unsigned i = 0; i != 4; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    if (cast<ConstantSDNode>(Arg)->getValue() != i)
      return false;
  }

  // Upper quadword shuffled.
  for (unsigned i = 4; i != 8; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val < 4 || Val > 7)
      return false;
  }

  return true;
}

/// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to PSHUFLW.
bool X86::isPSHUFLWMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 8)
    return false;

  // Upper quadword copied in order.
  for (unsigned i = 4; i != 8; ++i)
    if (!isUndefOrEqual(N->getOperand(i), i))
      return false;

  // Lower quadword shuffled.
  for (unsigned i = 0; i != 4; ++i)
    if (!isUndefOrInRange(N->getOperand(i), 0, 4))
      return false;

  return true;
}

/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to SHUFP*.
static bool isSHUFPMask(std::vector<SDOperand> &N) {
  unsigned NumElems = N.size();
  if (NumElems != 2 && NumElems != 4) return false;

  unsigned Half = NumElems / 2;
  for (unsigned i = 0; i < Half; ++i)
    if (!isUndefOrInRange(N[i], 0, NumElems))
      return false;
  for (unsigned i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(N[i], NumElems, NumElems*2))
      return false;

  return true;
}

bool X86::isSHUFPMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  std::vector<SDOperand> Ops(N->op_begin(), N->op_end());
  return ::isSHUFPMask(Ops);
}

/// isCommutedSHUFP - Returns true if the shuffle mask is except
/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
/// half elements to come from vector 1 (which would equal the dest.) and
/// the upper half to come from vector 2.
static bool isCommutedSHUFP(std::vector<SDOperand> &Ops) {
  unsigned NumElems = Ops.size();
  if (NumElems != 2 && NumElems != 4) return false;

  unsigned Half = NumElems / 2;
  for (unsigned i = 0; i < Half; ++i)
    if (!isUndefOrInRange(Ops[i], NumElems, NumElems*2))
      return false;
  for (unsigned i = Half; i < NumElems; ++i)
    if (!isUndefOrInRange(Ops[i], 0, NumElems))
      return false;
  return true;
}

static bool isCommutedSHUFP(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  std::vector<SDOperand> Ops(N->op_begin(), N->op_end());
  return isCommutedSHUFP(Ops);
}

/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
bool X86::isMOVHLPSMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 4)
    return false;

  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
  return isUndefOrEqual(N->getOperand(0), 6) &&
         isUndefOrEqual(N->getOperand(1), 7) &&
         isUndefOrEqual(N->getOperand(2), 2) &&
         isUndefOrEqual(N->getOperand(3), 3);
}

/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
bool X86::isMOVLPMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  unsigned NumElems = N->getNumOperands();
  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getOperand(i), i + NumElems))
      return false;

  for (unsigned i = NumElems/2; i < NumElems; ++i)
    if (!isUndefOrEqual(N->getOperand(i), i))
      return false;

  return true;
}

/// isMOVHPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVHP{S|D}
/// and MOVLHPS.
bool X86::isMOVHPMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  unsigned NumElems = N->getNumOperands();
  if (NumElems != 2 && NumElems != 4)
    return false;

  for (unsigned i = 0; i < NumElems/2; ++i)
    if (!isUndefOrEqual(N->getOperand(i), i))
      return false;

  for (unsigned i = 0; i < NumElems/2; ++i) {
    SDOperand Arg = N->getOperand(i + NumElems/2);
    if (!isUndefOrEqual(Arg, i + NumElems))
      return false;
  }

  return true;
}

/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKL.
bool static isUNPCKLMask(std::vector<SDOperand> &N, bool V2IsSplat = false) {
  unsigned NumElems = N.size();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
    SDOperand BitI  = N[i];
    SDOperand BitI1 = N[i+1];
    if (!isUndefOrEqual(BitI, j))
      return false;
    if (V2IsSplat) {
      if (isUndefOrEqual(BitI1, NumElems))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElems))
        return false;
    }
  }

  return true;
}

bool X86::isUNPCKLMask(SDNode *N, bool V2IsSplat) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  std::vector<SDOperand> Ops(N->op_begin(), N->op_end());
  return ::isUNPCKLMask(Ops, V2IsSplat);
}

/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to UNPCKH.
bool static isUNPCKHMask(std::vector<SDOperand> &N, bool V2IsSplat = false) {
  unsigned NumElems = N.size();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
    SDOperand BitI  = N[i];
    SDOperand BitI1 = N[i+1];
    if (!isUndefOrEqual(BitI, j + NumElems/2))
      return false;
    if (V2IsSplat) {
      if (isUndefOrEqual(BitI1, NumElems))
        return false;
    } else {
      if (!isUndefOrEqual(BitI1, j + NumElems/2 + NumElems))
        return false;
    }
  }

  return true;
}

bool X86::isUNPCKHMask(SDNode *N, bool V2IsSplat) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  std::vector<SDOperand> Ops(N->op_begin(), N->op_end());
  return ::isUNPCKHMask(Ops, V2IsSplat);
}

/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
bool X86::isUNPCKL_v_undef_Mask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  unsigned NumElems = N->getNumOperands();
  if (NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  for (unsigned i = 0, j = 0; i != NumElems; i += 2, ++j) {
    SDOperand BitI  = N->getOperand(i);
    SDOperand BitI1 = N->getOperand(i+1);

    if (!isUndefOrEqual(BitI, j))
      return false;
    if (!isUndefOrEqual(BitI1, j))
      return false;
  }

  return true;
}

/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSS,
/// MOVSD, and MOVD, i.e. setting the lowest element.
static bool isMOVLMask(std::vector<SDOperand> &N) {
  unsigned NumElems = N.size();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  if (!isUndefOrEqual(N[0], NumElems))
    return false;

  for (unsigned i = 1; i < NumElems; ++i) {
    SDOperand Arg = N[i];
    if (!isUndefOrEqual(Arg, i))
      return false;
  }

  return true;
}

bool X86::isMOVLMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  std::vector<SDOperand> Ops(N->op_begin(), N->op_end());
  return ::isMOVLMask(Ops);
}

/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
/// element of vector 2 and the other elements to come from vector 1 in order.
static bool isCommutedMOVL(std::vector<SDOperand> &Ops, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  unsigned NumElems = Ops.size();
  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
    return false;

  if (!isUndefOrEqual(Ops[0], 0))
    return false;

  for (unsigned i = 1; i < NumElems; ++i) {
    SDOperand Arg = Ops[i];
    if (!(isUndefOrEqual(Arg, i+NumElems) ||
          (V2IsUndef && isUndefOrInRange(Arg, NumElems, NumElems*2)) ||
          (V2IsSplat && isUndefOrEqual(Arg, NumElems))))
      return false;
  }

  return true;
}

static bool isCommutedMOVL(SDNode *N, bool V2IsSplat = false,
                           bool V2IsUndef = false) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);
  std::vector<SDOperand> Ops(N->op_begin(), N->op_end());
  return isCommutedMOVL(Ops, V2IsSplat, V2IsUndef);
}

/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
bool X86::isMOVSHDUPMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 4)
    return false;

  // Expect 1, 1, 3, 3
  for (unsigned i = 0; i < 2; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val != 1) return false;
  }

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val != 3) return false;
    HasHi = true;
  }

  // Don't use movshdup if it can be done with a shufps.
  return HasHi;
}

/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
bool X86::isMOVSLDUPMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 4)
    return false;

  // Expect 0, 0, 2, 2
  for (unsigned i = 0; i < 2; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val != 0) return false;
  }

  bool HasHi = false;
  for (unsigned i = 2; i < 4; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val != 2) return false;
    HasHi = true;
  }

  // Don't use movshdup if it can be done with a shufps.
  return HasHi;
}

/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
/// a splat of a single element.
static bool isSplatMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  // This is a splat operation if each element of the permute is the same, and
  // if the value doesn't reference the second vector.
  unsigned NumElems = N->getNumOperands();
  SDOperand ElementBase;
  unsigned i = 0;
  for (; i != NumElems; ++i) {
    SDOperand Elt = N->getOperand(i);
    if (ConstantSDNode *EltV = dyn_cast<ConstantSDNode>(Elt)) {
      ElementBase = Elt;
      break;
    }
  }

  if (!ElementBase.Val)
    return false;

  for (; i != NumElems; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    if (Arg != ElementBase) return false;
  }

  // Make sure it is a splat of the first vector operand.
  return cast<ConstantSDNode>(ElementBase)->getValue() < NumElems;
}

/// isSplatMask - Return true if the specified VECTOR_SHUFFLE operand specifies
/// a splat of a single element and it's a 2 or 4 element mask.
bool X86::isSplatMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  // We can only splat 64-bit, and 32-bit quantities with a single instruction.
  if (N->getNumOperands() != 4 && N->getNumOperands() != 2)
    return false;
  return ::isSplatMask(N);
}

/// isSplatLoMask - Return true if the specified VECTOR_SHUFFLE operand
/// specifies a splat of zero element.
bool X86::isSplatLoMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i) 
    if (!isUndefOrEqual(N->getOperand(i), 0))
      return false;
  return true;
}

/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
/// instructions.
unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  unsigned NumOperands = N->getNumOperands();
  unsigned Shift = (NumOperands == 4) ? 2 : 1;
  unsigned Mask = 0;
  for (unsigned i = 0; i < NumOperands; ++i) {
    unsigned Val = 0;
    SDOperand Arg = N->getOperand(NumOperands-i-1);
    if (Arg.getOpcode() != ISD::UNDEF)
      Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val >= NumOperands) Val -= NumOperands;
    Mask |= Val;
    if (i != NumOperands - 1)
      Mask <<= Shift;
  }

  return Mask;
}

/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFHW
/// instructions.
unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  unsigned Mask = 0;
  // 8 nodes, but we only care about the last 4.
  for (unsigned i = 7; i >= 4; --i) {
    unsigned Val = 0;
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() != ISD::UNDEF)
      Val = cast<ConstantSDNode>(Arg)->getValue();
    Mask |= (Val - 4);
    if (i != 4)
      Mask <<= 2;
  }

  return Mask;
}

/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUFLW
/// instructions.
unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  unsigned Mask = 0;
  // 8 nodes, but we only care about the first 4.
  for (int i = 3; i >= 0; --i) {
    unsigned Val = 0;
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() != ISD::UNDEF)
      Val = cast<ConstantSDNode>(Arg)->getValue();
    Mask |= Val;
    if (i != 0)
      Mask <<= 2;
  }

  return Mask;
}

/// isPSHUFHW_PSHUFLWMask - true if the specified VECTOR_SHUFFLE operand
/// specifies a 8 element shuffle that can be broken into a pair of
/// PSHUFHW and PSHUFLW.
static bool isPSHUFHW_PSHUFLWMask(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR);

  if (N->getNumOperands() != 8)
    return false;

  // Lower quadword shuffled.
  for (unsigned i = 0; i != 4; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val > 4)
      return false;
  }

  // Upper quadword shuffled.
  for (unsigned i = 4; i != 8; ++i) {
    SDOperand Arg = N->getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) continue;
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val < 4 || Val > 7)
      return false;
  }

  return true;
}

/// CommuteVectorShuffle - Swap vector_shuffle operandsas well as
/// values in ther permute mask.
static SDOperand CommuteVectorShuffle(SDOperand Op, SDOperand &V1,
                                      SDOperand &V2, SDOperand &Mask,
                                      SelectionDAG &DAG) {
  MVT::ValueType VT = Op.getValueType();
  MVT::ValueType MaskVT = Mask.getValueType();
  MVT::ValueType EltVT = MVT::getVectorBaseType(MaskVT);
  unsigned NumElems = Mask.getNumOperands();
  std::vector<SDOperand> MaskVec;

  for (unsigned i = 0; i != NumElems; ++i) {
    SDOperand Arg = Mask.getOperand(i);
    if (Arg.getOpcode() == ISD::UNDEF) {
      MaskVec.push_back(DAG.getNode(ISD::UNDEF, EltVT));
      continue;
    }
    assert(isa<ConstantSDNode>(Arg) && "Invalid VECTOR_SHUFFLE mask!");
    unsigned Val = cast<ConstantSDNode>(Arg)->getValue();
    if (Val < NumElems)
      MaskVec.push_back(DAG.getConstant(Val + NumElems, EltVT));
    else
      MaskVec.push_back(DAG.getConstant(Val - NumElems, EltVT));
  }