X86ISelLowering.cpp

  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
         "Unknown SINT_TO_FP to lower!");

  // These are really Legal; return the operand so the caller accepts it as
  // Legal.
  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
    return Op;
  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
      Subtarget->is64Bit()) {
    return Op;
  }

  DebugLoc dl = Op.getDebugLoc();
  unsigned Size = SrcVT.getSizeInBits()/8;
  MachineFunction &MF = DAG.getMachineFunction();
  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                               StackSlot,
                               MachinePointerInfo::getFixedStack(SSFI),
                               false, false, 0);
  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
}

SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                                     SDValue StackSlot,
                                     SelectionDAG &DAG) const {
  // Build the FILD
  DebugLoc DL = Op.getDebugLoc();
  SDVTList Tys;
  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
  if (useSSE)
    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
  else
    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

  unsigned ByteSize = SrcVT.getSizeInBits()/8;

  FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
  MachineMemOperand *MMO;
  if (FI) {
    int SSFI = FI->getIndex();
    MMO =
      DAG.getMachineFunction()
      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOLoad, ByteSize, ByteSize);
  } else {
    MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
    StackSlot = StackSlot.getOperand(1);
  }
  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
  SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                           X86ISD::FILD, DL,
                                           Tys, Ops, array_lengthof(Ops),
                                           SrcVT, MMO);

  if (useSSE) {
    Chain = Result.getValue(1);
    SDValue InFlag = Result.getValue(2);

    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
    // shouldn't be necessary except that RFP cannot be live across
    // multiple blocks. When stackifier is fixed, they can be uncoupled.
    MachineFunction &MF = DAG.getMachineFunction();
    unsigned SSFISize = Op.getValueType().getSizeInBits()/8;
    int SSFI = MF.getFrameInfo()->CreateStackObject(SSFISize, SSFISize, false);
    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
    Tys = DAG.getVTList(MVT::Other);
    SDValue Ops[] = {
      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
    };
    MachineMemOperand *MMO =
      DAG.getMachineFunction()
      .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, SSFISize, SSFISize);

    Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
                                    Ops, array_lengthof(Ops),
                                    Op.getValueType(), MMO);
    Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, 0);
  }

  return Result;
}

// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                               SelectionDAG &DAG) const {
  // This algorithm is not obvious. Here it is in C code, more or less:
  /*
    double uint64_to_double( uint32_t hi, uint32_t lo ) {
      static const __m128i exp = { 0x4330000045300000ULL, 0 };
      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };

      // Copy ints to xmm registers.
      __m128i xh = _mm_cvtsi32_si128( hi );
      __m128i xl = _mm_cvtsi32_si128( lo );

      // Combine into low half of a single xmm register.
      __m128i x = _mm_unpacklo_epi32( xh, xl );
      __m128d d;
      double sd;

      // Merge in appropriate exponents to give the integer bits the right
      // magnitude.
      x = _mm_unpacklo_epi32( x, exp );

      // Subtract away the biases to deal with the IEEE-754 double precision
      // implicit 1.
      d = _mm_sub_pd( (__m128d) x, bias );

      // All conversions up to here are exact. The correctly rounded result is
      // calculated using the current rounding mode using the following
      // horizontal add.
      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
                                // store doesn't really need to be here (except
                                // maybe to zero the other double)
      return sd;
    }
  */

  DebugLoc dl = Op.getDebugLoc();
  LLVMContext *Context = DAG.getContext();

  // Build some magic constants.
  std::vector<Constant*> CV0;
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
  Constant *C0 = ConstantVector::get(CV0);
  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);

  std::vector<Constant*> CV1;
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
  CV1.push_back(
    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
  Constant *C1 = ConstantVector::get(CV1);
  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);

  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(1)));
  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                        Op.getOperand(0),
                                        DAG.getIntPtrConstant(0)));
  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                              MachinePointerInfo::getConstantPool(),
                              false, false, 16);
  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                              MachinePointerInfo::getConstantPool(),
                              false, false, 16);
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);

  // Add the halves; easiest way is to swap them into another reg first.
  int ShufMask[2] = { 1, -1 };
  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
                     DAG.getIntPtrConstant(0));
}

// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
                                               SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();
  // FP constant to bias correct the final result.
  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
                                   MVT::f64);

  // Load the 32-bit value into an XMM register.
  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                                         Op.getOperand(0),
                                         DAG.getIntPtrConstant(0)));

  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                     DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Load),
                     DAG.getIntPtrConstant(0));

  // Or the load with the bias.
  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Load)),
                           DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   MVT::v2f64, Bias)));
  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
                   DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or),
                   DAG.getIntPtrConstant(0));

  // Subtract the bias.
  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);

  // Handle final rounding.
  EVT DestVT = Op.getValueType();

  if (DestVT.bitsLT(MVT::f64)) {
    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
                       DAG.getIntPtrConstant(0));
  } else if (DestVT.bitsGT(MVT::f64)) {
    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
  }

  // Handle final rounding.
  return Sub;
}

SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  EVT SrcVT = N0.getValueType();
  EVT DstVT = Op.getValueType();
  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue WordOff = DAG.getConstant(4, getPointerTy());
    SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
                                     getPointerTy(), StackSlot, WordOff);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
                                  OffsetSlot, MachinePointerInfo(),
                                  false, false, 0);
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                StackSlot, MachinePointerInfo(),
                               false, false, 0);
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative.  This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UNIT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO =
    DAG.getMachineFunction()
    .getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                          MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, 3,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);

  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
                                 Op.getOperand(0), DAG.getConstant(0, MVT::i64),
                                 ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
                             ConstantInt::get(*DAG.getContext(), FF.zext(64)),
                                         getPointerTy());

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0);
  SDValue Four = DAG.getIntPtrConstant(4);
  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
                               Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(), FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(),
                                 FudgePtr, MachinePointerInfo::getConstantPool(),
                                 MVT::f32, false, false, 4);
  // Extend everything to 80 bits to force it to be done on x87.
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
}

std::pair<SDValue,SDValue> X86TargetLowering::
FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) const {
  DebugLoc DL = Op.getDebugLoc();

  EVT DstTy = Op.getValueType();

  if (!IsSigned) {
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_SINT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget->is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());

  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());


  unsigned Opc;
  switch (DstTy.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  EVT TheVT = Op.getOperand(0).getValueType();
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(SSFI),
                         false, false, 0);
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                              MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, 3,
                                    DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
  }

  MachineMemOperand *MMO =
    MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                            MachineMemOperand::MOStore, MemSize, MemSize);

  // Build the FP_TO_INT*_IN_MEM
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                         Ops, 3, DstTy, MMO);

  return std::make_pair(FIST, StackSlot);
}

SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  if (Op.getValueType().isVector())
    return SDValue();

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, true);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (FIST.getNode() == 0) return Op;

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, MachinePointerInfo(), false, false, 0);
}

SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
                                           SelectionDAG &DAG) const {
  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG, false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  assert(FIST.getNode() && "Unexpected failure");

  // Load the result.
  return DAG.getLoad(Op.getValueType(), Op.getDebugLoc(),
                     FIST, StackSlot, MachinePointerInfo(), false, false, 0);
}

SDValue X86TargetLowering::LowerFABS(SDValue Op,
                                     SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, 16);
  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}

SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT EltVT = VT;
  if (VT.isVector())
    EltVT = VT.getVectorElementType();
  std::vector<Constant*> CV;
  if (EltVT == MVT::f64) {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
    CV.push_back(C);
    CV.push_back(C);
  } else {
    Constant *C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
    CV.push_back(C);
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                             MachinePointerInfo::getConstantPool(),
                             false, false, 16);
  if (VT.isVector()) {
    return DAG.getNode(ISD::BITCAST, dl, VT,
                       DAG.getNode(ISD::XOR, dl, MVT::v2i64,
                    DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
                                Op.getOperand(0)),
                    DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Mask)));
  } else {
    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
  }
}

SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
  LLVMContext *Context = DAG.getContext();
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();
  EVT SrcVT = Op1.getValueType();

  // If second operand is smaller, extend it first.
  if (SrcVT.bitsLT(VT)) {
    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
    SrcVT = VT;
  }
  // And if it is bigger, shrink it first.
  if (SrcVT.bitsGT(VT)) {
    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
    SrcVT = VT;
  }

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.

  // First get the sign bit of second operand.
  std::vector<Constant*> CV;
  if (SrcVT == MVT::f64) {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
  } else {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
  }
  Constant *C = ConstantVector::get(CV);
  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, 16);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);

  // Shift sign bit right or left if the two operands have different types.
  if (SrcVT.bitsGT(VT)) {
    // Op0 is MVT::f32, Op1 is MVT::f64.
    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
                          DAG.getConstant(32, MVT::i32));
    SignBit = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, SignBit);
    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
                          DAG.getIntPtrConstant(0));
  }

  // Clear first operand sign bit.
  CV.clear();
  if (VT == MVT::f64) {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
  } else {
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
  }
  C = ConstantVector::get(CV);
  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
                              MachinePointerInfo::getConstantPool(),
                              false, false, 16);
  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);

  // Or the value with the sign bit.
  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}

SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  DebugLoc dl = Op.getDebugLoc();
  EVT VT = Op.getValueType();

  // Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
  SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
                                  DAG.getConstant(1, VT));
  return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}

/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
                                    SelectionDAG &DAG) const {
  DebugLoc dl = Op.getDebugLoc();

  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO:
    NeedOF = true;
    break;
  }

  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  unsigned Opcode = 0;
  unsigned NumOperands = 0;
  switch (Op.getNode()->getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output.  Alas, even if none our users are stores, that
    // doesn't prove we're O.K.  Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC.  A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->getAPIntValue() == 1) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->getAPIntValue().isAllOnesValue()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::AND: {
    // If the primary and result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better.
    bool NonFlagUse = false;
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
      SDNode *User = *UI;
      unsigned UOpNo = UI.getOperandNo();
      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
        // Look pass truncate.
        UOpNo = User->use_begin().getOperandNo();
        User = *User->use_begin();
      }

      if (User->getOpcode() != ISD::BRCOND &&
          User->getOpcode() != ISD::SETCC &&
          (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
        NonFlagUse = true;
        break;
      }
    }

    if (!NonFlagUse)
      break;
  }
    // FALL THROUGH
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
           UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (Op.getNode()->getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::OR:  Opcode = X86ISD::OR;  break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }

  if (Opcode == 0)
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, Op.getValueType()));

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops;
  for (unsigned i = 0; i != NumOperands; ++i)
    Ops.push_back(Op.getOperand(i));

  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}

/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   SelectionDAG &DAG) const {
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1))
    if (C->getAPIntValue() == 0)
      return EmitTest(Op0, X86CC, DAG);

  DebugLoc dl = Op0.getDebugLoc();
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}

/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT node
/// if it's possible.
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
                                     DebugLoc dl, SelectionDAG &DAG) const {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (ConstantSDNode *And00C = dyn_cast<ConstantSDNode>(Op0.getOperand(0)))
      if (And00C->getZExtValue() == 1) {
        // If we looked past a truncate, check that it's only truncating away
        // known zeros.
        unsigned BitWidth = Op0.getValueSizeInBits();
        unsigned AndBitWidth = And.getValueSizeInBits();
        if (BitWidth > AndBitWidth) {
          APInt Mask = APInt::getAllOnesValue(BitWidth), Zeros, Ones;
          DAG.ComputeMaskedBits(Op0, Mask, Zeros, Ones);
          if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
            return SDValue();
        }
        LHS = Op1;
        RHS = Op0.getOperand(1);
      }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    SDValue AndLHS = Op0;
    if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }
  }

  if (LHS.getNode()) {
    // If LHS is i8, promote it to i32 with any_extend.  There is no i8 BT
    // instruction.  Since the shift amount is in-range-or-undefined, we know
    // that doing a bittest on the i32 value is ok.  We extend to i32 because
    // the encoding for the i16 version is larger than the i32 version.
    // Also promote i16 to i32 for performance / code size reason.
    if (LHS.getValueType() == MVT::i8 ||
        LHS.getValueType() == MVT::i16)
      LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);

    // If the operand types disagree, extend the shift amount to match.  Since
    // BT ignores high bits (like shifts) we can use anyextend.
    if (LHS.getValueType() != RHS.getValueType())
      RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS);

    SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS);
    unsigned Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
    return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                       DAG.getConstant(Cond, MVT::i8), BT);
  }

  return SDValue();
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  DebugLoc dl = Op.getDebugLoc();
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
      Op1.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(Op1)->isNullValue() &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
    if (NewSetCC.getNode())
      return NewSetCC;
  }

  // Look for X == 0, X == 1, X != 0, or X != 1.  We can simplify some forms of
  // these.
  if (Op1.getOpcode() == ISD::Constant &&
      (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
       cast<ConstantSDNode>(Op1)->isNullValue()) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one with
    // the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^
        cast<ConstantSDNode>(Op1)->isNullValue();
      if (!Invert) return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                         DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
    }
  }

  bool isFP = Op1.getValueType().isFloatingPoint();
  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(X86CC, MVT::i8), EFLAGS);
}

SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
  SDValue Cond;
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  EVT VT = Op.getValueType();
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
  DebugLoc dl = Op.getDebugLoc();

  if (isFP) {
    unsigned SSECC = 8;
    EVT VT0 = Op0.getValueType();
    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
    bool Swap = false;

    switch (SetCCOpcode) {
    default: break;
    case ISD::SETOEQ:
    case ISD::SETEQ:  SSECC = 0; break;
    case ISD::SETOGT:
    case ISD::SETGT: Swap = true; // Fallthrough
    case ISD::SETLT:
    case ISD::SETOLT: SSECC = 1; break;
    case ISD::SETOGE:
    case ISD::SETGE: Swap = true; // Fallthrough
    case ISD::SETLE:
    case ISD::SETOLE: SSECC = 2; break;
    case ISD::SETUO:  SSECC = 3; break;
    case ISD::SETUNE:
    case ISD::SETNE:  SSECC = 4; break;
    case ISD::SETULE: Swap = true;
    case ISD::SETUGE: SSECC = 5; break;
    case ISD::SETULT: Swap = true;
    case ISD::SETUGT: SSECC = 6; break;
    case ISD::SETO:   SSECC = 7; break;
    }
    if (Swap)
      std::swap(Op0, Op1);

    // In the two special cases we can't handle, emit two comparisons.
    if (SSECC == 8) {
      if (SetCCOpcode == ISD::SETUEQ) {
        SDValue UNORD, EQ;
        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
      }
      else if (SetCCOpcode == ISD::SETONE) {
        SDValue ORD, NEQ;
        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
      }
      llvm_unreachable("Illegal FP comparison");
    }
    // Handle all other FP comparisons here.
    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
  }

  // We are handling one of the integer comparisons here.  Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
  bool Swap = false, Invert = false, FlipSigns = false;

  switch (VT.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
  }

  switch (SetCCOpcode) {
  default: break;
  case ISD::SETNE:  Invert = true;
  case ISD::SETEQ:  Opc = EQOpc; break;
  case ISD::SETLT:  Swap = true;
  case ISD::SETGT:  Opc = GTOpc; break;
  case ISD::SETGE:  Swap = true;
  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
  case ISD::SETULT: Swap = true;
  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
  case ISD::SETUGE: Swap = true;
  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  // Since SSE has no unsigned integer comparisons, we need to flip  the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    EVT EltVT = VT.getVectorElementType();
    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
                                      EltVT);
    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
                                    SignBits.size());
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  return Result;
}

// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getNode()->getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD ||
       Opc == X86ISD::SUB ||
       Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB ||
       Opc == X86ISD::SMUL ||
       Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC ||
       Opc == X86ISD::DEC ||
       Opc == X86ISD::OR ||
       Opc == X86ISD::XOR ||
       Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isZero(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isNullValue();
}

static bool isAllOnes(SDValue V) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
  return C && C->isAllOnesValue();
}

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Cond  = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  DebugLoc DL = Op.getDebugLoc();
  SDValue CC;

  if (Cond.getOpcode() == ISD::SETCC) {
    SDValue NewCond = LowerSETCC(Cond, DAG);
    if (NewCond.getNode())
      Cond = NewCond;
  }

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isZero(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);

    unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnes(Op1) || isAllOnes(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnes(Op2) ? Op1 : Op2;