SPUISelLowering.cpp

//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the SPUTargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "SPURegisterNames.h"
#include "SPUISelLowering.h"
#include "SPUTargetMachine.h"
#include "SPUFrameInfo.h"
#include "llvm/ADT/VectorExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Constants.h"
#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"

#include <map>

using namespace llvm;

// File-local tables and predicates used by the lowering code in this file.
namespace {
  //! Opcode -> printable name table; filled on first use by
  //! getTargetNodeName() below
  std::map<unsigned, const char *> node_names;

  //! Associates a value type with the preferred-slot byte recorded for it
  struct valtype_map_s {
    const MVT   valtype;
    const int   prefslot_byte;
  };

  //! One row per value type known to getValueTypeMapEntry()
  const valtype_map_s valtype_map[] = {
    { MVT::i1,   3 },
    { MVT::i8,   3 },
    { MVT::i16,  2 },
    { MVT::i32,  0 },
    { MVT::f32,  0 },
    { MVT::i64,  0 },
    { MVT::f64,  0 },
    { MVT::i128, 0 }
  };

  //! Number of rows in valtype_map
  const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);

  //! Look up the valtype_map row for a value type
  /*!
    \arg VT Value type to look up
    \return Pointer to the matching row. When no row matches, debug builds
    print a diagnostic and abort; builds with NDEBUG defined return NULL.
   */
  const valtype_map_s *getValueTypeMapEntry(MVT VT) {
    const valtype_map_s *const table_end = valtype_map + n_valtype_map;

    for (const valtype_map_s *row = valtype_map; row != table_end; ++row) {
      if (row->valtype == VT)
        return row;
    }

    // Nothing matched.
#ifndef NDEBUG
    cerr << "getValueTypeMapEntry returns NULL for "
         << VT.getMVTString()
         << "\n";
    abort();
#endif

    return 0;
  }

  //! Predicate that returns true if operand is a memory target
  /*!
    \arg Op Operand to test
    \return true if the operand's opcode is a (target) global address, global
    TLS address, jump table, constant pool or external symbol, or an A-form
    address.
   */
  bool isMemoryOperand(const SDValue &Op)
  {
    switch (Op.getOpcode()) {
    case ISD::GlobalAddress:
    case ISD::GlobalTLSAddress:
    case ISD::JumpTable:
    case ISD::ConstantPool:
    case ISD::ExternalSymbol:
    case ISD::TargetGlobalAddress:
    case ISD::TargetGlobalTLSAddress:
    case ISD::TargetJumpTable:
    case ISD::TargetConstantPool:
    case ISD::TargetExternalSymbol:
    case SPUISD::AFormAddr:
      return true;
    default:
      return false;
    }
  }

  //! Predicate that returns true if the operand is an indirect target
  /*!
    \arg Op Operand to test
    \return true if the operand's opcode is ISD::Register or SPUISD::LDRESULT.
   */
  bool isIndirectOperand(const SDValue &Op)
  {
    switch (Op.getOpcode()) {
    case ISD::Register:
    case SPUISD::LDRESULT:
      return true;
    default:
      return false;
    }
  }
}

//! Construct the Cell SPU target lowering object
/*!
  Registers the SPU register class used for each scalar and vector value
  type, then records, per (operation, value type) pair, whether the operation
  is Legal, Promote, Expand or Custom. The calls are order-sensitive: a few
  (operation, type) pairs are set more than once below, and presumably the
  last call for a pair is the one that takes effect (confirm against
  TargetLowering::setOperationAction).

  \param TM The SPU target machine; passed to the TargetLowering base class
  and kept in SPUTM.
 */
SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
  : TargetLowering(TM),
    SPUTM(TM)
{
  // NOTE(review): the comment previously here ("Fold away setcc operations if
  // possible") did not describe this call. Presumably this marks divides by a
  // power of two as cheap for the code generator -- confirm against
  // TargetLowering.
  setPow2DivIsCheap();

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // Set up the SPU's register classes:
  addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
  addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
  addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
  addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
  addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
  addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
  addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);

  // Initialize libcalls:
  setLibcallName(RTLIB::MUL_I64, "__muldi3");

  // SPU has no sign or zero extended loads for i1, i8, i16:
  // i1 extending loads are promoted...
  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);

  // ...while i8 extending loads and truncating stores to i8 are custom
  // lowered:
  setLoadExtAction(ISD::EXTLOAD,  MVT::i8, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
  setTruncStoreAction(MVT::i8,    MVT::i8, Custom);
  setTruncStoreAction(MVT::i16,   MVT::i8, Custom);
  setTruncStoreAction(MVT::i32,   MVT::i8, Custom);
  setTruncStoreAction(MVT::i64,   MVT::i8, Custom);
  setTruncStoreAction(MVT::i128,  MVT::i8, Custom);

  // i16 extending loads are custom lowered as well:
  setLoadExtAction(ISD::EXTLOAD,  MVT::i16, Custom);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);

  // Extending loads from f32 are custom lowered:
  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Custom);

  // SPU constant load actions are custom lowered (i64 and f64); f32
  // floating point constants are legal:
  setOperationAction(ISD::Constant,   MVT::i64, Custom);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);

  // SPU's loads and stores have to be custom lowered. The loop covers every
  // simple value type from MVT::i8 up to, but not including, MVT::f128:
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
       ++sctype) {
    MVT VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::LOAD, VT, Custom);
    setOperationAction(ISD::STORE, VT, Custom);
  }

  // Custom lower BRCOND for i8 to "promote" the result to i16
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);

  // Expand the jumptable branches
  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);

  // Custom lower SELECT_CC for i8, i16 and i32, but expand by default. The
  // i64 case is compiled out.
  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
#if 0
  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
#endif

  // SPU has no intrinsics for these particular operations:
  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);

  // Expand i32/i64 SREM and UREM.
  // NOTE(review): the original comment read "PowerPC has no SREM/UREM
  // instructions"; presumably the same holds for SPU -- confirm.
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // We don't support sin/cos/fmod (sqrt is handled just below)
  setOperationAction(ISD::FSIN , MVT::f64, Expand);
  setOperationAction(ISD::FCOS , MVT::f64, Expand);
  setOperationAction(ISD::FREM , MVT::f64, Expand);
  setOperationAction(ISD::FSIN , MVT::f32, Expand);
  setOperationAction(ISD::FCOS , MVT::f32, Expand);
  setOperationAction(ISD::FREM , MVT::f32, Expand);

  // FSQRT is expanded unconditionally.
  // NOTE(review): the original comment mentioned using hardware square root
  // when "GP optimizations" are enabled, but no such condition exists here.
  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
  setOperationAction(ISD::FSQRT, MVT::f32, Expand);

  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);

  // SPU can do rotate right and left, so legalize it... but customize for i8
  // because instructions don't exist.

  // FIXME: Change from "expand" to appropriate type once ROTR is supported in
  //        .td files.
  // (The intended final action for each type is kept in the trailing comment.)
  setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
  setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);

  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
  setOperationAction(ISD::ROTL, MVT::i8,     Custom);

  // SPU has no native version of shift left/right for i8
  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
  setOperationAction(ISD::SRA,  MVT::i8,     Custom);

  // SPU needs custom lowering for shift left/right for i64
  setOperationAction(ISD::SHL,  MVT::i64,    Custom);
  setOperationAction(ISD::SRL,  MVT::i64,    Custom);
  setOperationAction(ISD::SRA,  MVT::i64,    Custom);

  // Custom lower i8 and i32 multiplications; i64 multiplication is expanded
  // (RTLIB::MUL_I64 is named "__muldi3" above)
  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
  setOperationAction(ISD::MUL,  MVT::i32,    Custom);
  setOperationAction(ISD::MUL,  MVT::i64,    Expand);   // libcall

  // SMUL_LOHI, UMUL_LOHI
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Custom);
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Custom);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Custom);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Custom);

  // Need to custom handle (some) common i8, i64 math ops
  setOperationAction(ISD::ADD,  MVT::i64,    Custom);
  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
  setOperationAction(ISD::SUB,  MVT::i64,    Custom);

  // SPU does not have BSWAP. It does have i32 support CTLZ.
  // CTPOP has to be custom lowered.
  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);

  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);

  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);

  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);

  // SPU has a version of select that implements (a&~c)|(b&c), just like
  // select ought to work (i64 select is expanded):
  setOperationAction(ISD::SELECT, MVT::i8,   Legal);
  setOperationAction(ISD::SELECT, MVT::i16,  Legal);
  setOperationAction(ISD::SELECT, MVT::i32,  Legal);
  setOperationAction(ISD::SELECT, MVT::i64,  Expand);

  // SETCC is legal for i8/i16/i32 and expanded for i64:
  setOperationAction(ISD::SETCC, MVT::i8,    Legal);
  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
  setOperationAction(ISD::SETCC, MVT::i64,   Expand);

  // Zero extension and sign extension for i64 have to be
  // custom legalized
  setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
  setOperationAction(ISD::ANY_EXTEND,  MVT::i64, Custom);

  // Custom lower truncates
  setOperationAction(ISD::TRUNCATE, MVT::i8, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::i32, Custom);
  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);

  // SPU has a legal FP -> signed INT instruction
  // NOTE(review): FP_TO_SINT/i64 is set to Custom again further below, and
  // FP_TO_UINT/i32 is later set to Promote, which presumably replaces the
  // Legal action recorded here.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);

  // FDIV on SPU requires custom lowering (f32 only; the f64 case is
  // commented out)
  setOperationAction(ISD::FDIV, MVT::f32, Custom);
  //setOperationAction(ISD::FDIV, MVT::f64, Custom);

  // SPU has [U|S]INT_TO_FP: legal for i32, promoted for i8/i16, custom
  // lowered for i64
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);

  // Bit conversions between 32-bit and 64-bit integer/FP types are legal
  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal);
  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal);
  setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal);
  setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal);

  // We cannot sextinreg(i1).  Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

  // Support label based line numbers.
  setOperationAction(ISD::DBG_STOPPOINT, MVT::Other, Expand);
  setOperationAction(ISD::DEBUG_LOC, MVT::Other, Expand);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address. JumpTable nodes are
  // custom lowered in the same loop.
  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
       ++sctype) {
    MVT VT = (MVT::SimpleValueType)sctype;

    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::ConstantPool,  VT, Custom);
    setOperationAction(ISD::JumpTable,     VT, Custom);
  }

  // RET must be custom lowered, to meet ABI requirements
  setOperationAction(ISD::RET,           MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART           , MVT::Other, Custom);

  // Use the default implementation.
  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);

  // Conversions between i64 and fp are custom lowered.
  // NOTE(review): both calls repeat settings already made above with the
  // same action; the original comment claimed Cell SPU "has instructions"
  // for these conversions, which the Custom action does not confirm.
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

  // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
  // (this follows the earlier FP_TO_UINT/i32 Legal setting)
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);

  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);

  // Register the vector register class for each supported vector type. The
  // loop that follows then sets the actions for every vector value type.
  // NOTE(review): the original comment said all vector operations are first
  // set to expand and then selectively enabled; the loop below actually
  // assigns a mix of Legal, Expand and Custom directly.
  addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
  addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);

  for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
    MVT VT = (MVT::SimpleValueType)i;

    // add/sub are legal for all supported vector VT's.
    setOperationAction(ISD::ADD , VT, Legal);
    setOperationAction(ISD::SUB , VT, Legal);
    // mul has to be custom lowered.
    setOperationAction(ISD::MUL , VT, Custom);

    // Bitwise ops, loads, stores and select are legal on vectors.
    setOperationAction(ISD::AND   , VT, Legal);
    setOperationAction(ISD::OR    , VT, Legal);
    setOperationAction(ISD::XOR   , VT, Legal);
    setOperationAction(ISD::LOAD  , VT, Legal);
    setOperationAction(ISD::SELECT, VT, Legal);
    setOperationAction(ISD::STORE,  VT, Legal);

    // These operations need to be expanded (FDIV is custom lowered):
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::FDIV, VT, Custom);

    // Custom lower build_vector, constant pool spills, insert and
    // extract vector elements:
    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
  }

  // Per-type settings applied after the generic vector loop: the v16i8
  // bitwise ops become Custom here. The MUL/v16i8 and
  // SCALAR_TO_VECTOR/v4f32 calls repeat the Custom action the loop already
  // assigned.
  setOperationAction(ISD::MUL, MVT::v16i8, Custom);
  setOperationAction(ISD::AND, MVT::v16i8, Custom);
  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);

  // Shift amounts are i32; booleans are represented as zero or one.
  setShiftAmountType(MVT::i32);
  setBooleanContents(ZeroOrOneBooleanContent);

  // R1 is the register saved/restored as the stack pointer.
  setStackPointerRegisterToSaveRestore(SPU::R1);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);

  // Called only after all register classes have been added above.
  computeRegisterProperties();

  // Set other properties:
  setSchedulingPreference(SchedulingForLatency);
}

//! Map an SPU-specific DAG node opcode to a printable name
/*!
  The file-local node_names table is filled the first time this function is
  called (i.e. while the table is still empty); later calls only look up.

  \param Opcode Node opcode to look up
  \return The "SPUISD::..." name for the opcode, or NULL if the opcode has no
  entry in the table.
 */
const char *
SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
{
  if (node_names.empty()) {
    // Opcode/name pairs copied into node_names on first use.
    static const struct {
      unsigned    opcode;
      const char *name;
    } known_nodes[] = {
      { (unsigned) SPUISD::RET_FLAG,           "SPUISD::RET_FLAG" },
      { (unsigned) SPUISD::Hi,                 "SPUISD::Hi" },
      { (unsigned) SPUISD::Lo,                 "SPUISD::Lo" },
      { (unsigned) SPUISD::PCRelAddr,          "SPUISD::PCRelAddr" },
      { (unsigned) SPUISD::AFormAddr,          "SPUISD::AFormAddr" },
      { (unsigned) SPUISD::IndirectAddr,       "SPUISD::IndirectAddr" },
      { (unsigned) SPUISD::LDRESULT,           "SPUISD::LDRESULT" },
      { (unsigned) SPUISD::CALL,               "SPUISD::CALL" },
      { (unsigned) SPUISD::SHUFB,              "SPUISD::SHUFB" },
      { (unsigned) SPUISD::SHUFFLE_MASK,       "SPUISD::SHUFFLE_MASK" },
      { (unsigned) SPUISD::CNTB,               "SPUISD::CNTB" },
      { (unsigned) SPUISD::PROMOTE_SCALAR,     "SPUISD::PROMOTE_SCALAR" },
      { (unsigned) SPUISD::VEC2PREFSLOT,       "SPUISD::VEC2PREFSLOT" },
      { (unsigned) SPUISD::MPY,                "SPUISD::MPY" },
      { (unsigned) SPUISD::MPYU,               "SPUISD::MPYU" },
      { (unsigned) SPUISD::MPYH,               "SPUISD::MPYH" },
      { (unsigned) SPUISD::MPYHH,              "SPUISD::MPYHH" },
      { (unsigned) SPUISD::SHLQUAD_L_BITS,     "SPUISD::SHLQUAD_L_BITS" },
      { (unsigned) SPUISD::SHLQUAD_L_BYTES,    "SPUISD::SHLQUAD_L_BYTES" },
      { (unsigned) SPUISD::VEC_SHL,            "SPUISD::VEC_SHL" },
      { (unsigned) SPUISD::VEC_SRL,            "SPUISD::VEC_SRL" },
      { (unsigned) SPUISD::VEC_SRA,            "SPUISD::VEC_SRA" },
      { (unsigned) SPUISD::VEC_ROTL,           "SPUISD::VEC_ROTL" },
      { (unsigned) SPUISD::VEC_ROTR,           "SPUISD::VEC_ROTR" },
      { (unsigned) SPUISD::ROTQUAD_RZ_BYTES,   "SPUISD::ROTQUAD_RZ_BYTES" },
      { (unsigned) SPUISD::ROTQUAD_RZ_BITS,    "SPUISD::ROTQUAD_RZ_BITS" },
      { (unsigned) SPUISD::ROTBYTES_LEFT,      "SPUISD::ROTBYTES_LEFT" },
      { (unsigned) SPUISD::ROTBYTES_LEFT_BITS, "SPUISD::ROTBYTES_LEFT_BITS" },
      { (unsigned) SPUISD::SELECT_MASK,        "SPUISD::SELECT_MASK" },
      { (unsigned) SPUISD::SELB,               "SPUISD::SELB" },
      { (unsigned) SPUISD::ADD_EXTENDED,       "SPUISD::ADD_EXTENDED" },
      { (unsigned) SPUISD::CARRY_GENERATE,     "SPUISD::CARRY_GENERATE" },
      { (unsigned) SPUISD::SUB_EXTENDED,       "SPUISD::SUB_EXTENDED" },
      { (unsigned) SPUISD::BORROW_GENERATE,    "SPUISD::BORROW_GENERATE" },
      { (unsigned) SPUISD::FPInterp,           "SPUISD::FPInterp" },
      { (unsigned) SPUISD::FPRecipEst,         "SPUISD::FPRecipEst" },
      { (unsigned) SPUISD::SEXT32TO64,         "SPUISD::SEXT32TO64" }
    };
    const size_t n_known = sizeof(known_nodes) / sizeof(known_nodes[0]);

    for (size_t idx = 0; idx < n_known; ++idx)
      node_names[known_nodes[idx].opcode] = known_nodes[idx].name;
  }

  std::map<unsigned, const char *>::iterator found = node_names.find(Opcode);

  if (found == node_names.end())
    return 0;

  return found->second;
}

//! Value type produced by a SETCC on the given operand
/*!
  \param Op Operand whose value type is examined
  \return The operand's own value type when it is an integer type, otherwise
  MVT::i32.
 */
MVT SPUTargetLowering::getSetCCResultType(const SDValue &Op) const {
  const MVT OperandVT = Op.getValueType();

  if (OperandVT.isInteger())
    return OperandVT;

  return MVT(MVT::i32);
}

//===----------------------------------------------------------------------===//
// Calling convention code:
//===----------------------------------------------------------------------===//

#include "SPUGenCallingConv.inc"

//===----------------------------------------------------------------------===//
//  LowerOperation implementation
//===----------------------------------------------------------------------===//

/// Aligned load common code for CellSPU
/*!
  \param[in] Op The SelectionDAG load or store operand (not inspected by this
  function; the node is accessed through \a LSN)
  \param[in] DAG The selection DAG
  \param[in] ST CellSPU subtarget information structure (not used by this
  function)
  \param[in] LSN The load or store node being lowered; supplies the base
  pointer, chain, source value and volatility for the emitted load
  \param[in,out] alignment Caller initializes this to the load or store node's
  value from getAlignment(); replaced by the global's alignment when the base
  pointer is "A-form address of a target global address plus constant"
  \param[out] alignOffs Constant byte offset found in the address (the constant
  operand of an ADD, or the frame index scaled by the stack slot size), or 0
  when there is none. Only the 16-byte-aligned part (alignOffs & ~0xf) is added
  to the address of the emitted load.
  \param[out] prefSlotOffs (alignOffs & 0xf) minus the preferred-slot byte of
  \a VT; this may be negative, callers normalize it
  \param[in] VT The load or store node's loaded or stored value type; used to
  look up the preferred slot. It is not modified by this function.
  \param[out] was16aligned true if \a alignment was 16 when the load was
  emitted, otherwise false. Can help to determine if the chunk needs to be
  rotated.
  \return The v16i8 load of the enclosing 16-byte chunk.

 Both load and store lowering load a block of data aligned on a 16-byte
 boundary. This is the common aligned load code shared between both.
 */
static SDValue
AlignedLoad(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST,
            LSBaseSDNode *LSN,
            unsigned &alignment, int &alignOffs, int &prefSlotOffs,
            MVT &VT, bool &was16aligned)
{
  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  const valtype_map_s *vtm = getValueTypeMapEntry(VT);
  SDValue basePtr = LSN->getBasePtr();
  SDValue chain = LSN->getChain();

  // Defaults for addresses that carry no recognizable constant offset:
  alignOffs = 0;
  prefSlotOffs = -vtm->prefslot_byte;

  if (basePtr.getOpcode() == ISD::ADD) {
    SDValue Op1 = basePtr.getNode()->getOperand(1);

    if (Op1.getOpcode() == ISD::Constant
        || Op1.getOpcode() == ISD::TargetConstant) {
      const ConstantSDNode *CN = cast<ConstantSDNode>(basePtr.getOperand(1));

      alignOffs = (int) CN->getZExtValue();

      // Adjust the rotation amount to ensure that the final result ends up in
      // the preferred slot:
      prefSlotOffs = (int) (alignOffs & 0xf) - vtm->prefslot_byte;
      basePtr = basePtr.getOperand(0);

      // Loading from memory, can we adjust alignment?
      if (basePtr.getOpcode() == SPUISD::AFormAddr) {
        SDValue APtr = basePtr.getOperand(0);
        if (APtr.getOpcode() == ISD::TargetGlobalAddress) {
          GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(APtr);
          alignment = GSDN->getGlobal()->getAlignment();
        }
      }
    }
  } else if (basePtr.getOpcode() == ISD::FrameIndex) {
    FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(basePtr);
    alignOffs = int(FIN->getIndex() * SPUFrameInfo::stackSlotSize());
    prefSlotOffs = (int) (alignOffs & 0xf) - vtm->prefslot_byte;
    // The stack pointer is an address, so the register node must have the
    // pointer type: it becomes an operand of the PtrVT ADD built below. (It
    // was previously created with the loaded/stored value type VT.)
    basePtr = DAG.getRegister(SPU::R1, PtrVT);
  }

  const int alignedOffs = (alignOffs & ~0xf);

  was16aligned = (alignment == 16);
  if (was16aligned) {
    // Realign the base pointer as a D-Form address:
    if (!isMemoryOperand(basePtr) || alignedOffs != 0) {
      basePtr = DAG.getNode(ISD::ADD, PtrVT,
                            basePtr,
                            DAG.getConstant(alignedOffs, PtrVT));
    }
  } else {
    // Unaligned load or we're using the "large memory" model, which means
    // that we have to be very pessimistic:
    if (isMemoryOperand(basePtr) || isIndirectOperand(basePtr)) {
      basePtr = DAG.getNode(SPUISD::IndirectAddr, PtrVT, basePtr,
                            DAG.getConstant(0, PtrVT));
    }

    // Add the offset
    basePtr = DAG.getNode(ISD::ADD, PtrVT, basePtr,
                          DAG.getConstant(alignedOffs, PtrVT));
  }

  // Emit the vector load of the whole 16-byte chunk:
  return DAG.getLoad(MVT::v16i8, chain, basePtr,
                     LSN->getSrcValue(), LSN->getSrcValueOffset(),
                     LSN->isVolatile(), 16);
}

/// Custom lower loads for CellSPU
/*!
 All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to rotate to extract the requested element.

 For extending loads, we also want to ensure that the following sequence is
 emitted, e.g. for MVT::f32 extending load to MVT::f64:

\verbatim
%1  v16i8,ch = load
%2  v16i8,ch = rotate %1
%3  v4f32,ch = bitconvert %2
%4  f32      = vec2prefslot %3
%5  f64      = fp_extend %4
\endverbatim

 \param Op The load node to lower (must be a LoadSDNode)
 \param DAG The selection DAG
 \param ST CellSPU subtarget information, forwarded to AlignedLoad
 \return An SPUISD::LDRESULT node carrying the loaded (and, for extending
 loads, extended) value and the chain of the 16-byte load. If AlignedLoad
 yields a null value, that value is returned unchanged. Addressing modes other
 than UNINDEXED print a diagnostic and abort.
*/
static SDValue
LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  LoadSDNode *LN = cast<LoadSDNode>(Op);
  MVT InVT = LN->getMemoryVT();
  MVT OutVT = Op.getValueType();
  ISD::LoadExtType ExtType = LN->getExtensionType();
  unsigned alignment = LN->getAlignment();

  switch (LN->getAddressingMode()) {
  case ISD::UNINDEXED: {
    // All three are outputs of AlignedLoad; initialized defensively.
    int offset = 0, rotamt = 0;
    bool was16aligned = false;
    SDValue result =
      AlignedLoad(Op, DAG, ST, LN, alignment, offset, rotamt, InVT,
                  was16aligned);

    if (result.getNode() == 0)
      return result;

    SDValue the_chain = result.getValue(1);

    // Rotate the chunk if necessary. AlignedLoad may report a negative
    // preferred-slot offset; bring it into 0..15 first.
    if (rotamt < 0)
      rotamt += 16;
    if (rotamt != 0 || !was16aligned) {
      SDValue Ops[2];

      Ops[0] = result;
      if (was16aligned) {
        // The rotation amount is fully known at compile time.
        Ops[1] = DAG.getConstant(rotamt, MVT::i16);
      } else {
        // The low address bits are not known: rotate by (address + rotamt).
        MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
        LoadSDNode *LN1 = cast<LoadSDNode>(result);
        Ops[1] = DAG.getNode(ISD::ADD, PtrVT, LN1->getBasePtr(),
                             DAG.getConstant(rotamt, PtrVT));
      }

      result = DAG.getNode(SPUISD::ROTBYTES_LEFT, MVT::v16i8, Ops, 2);
    }

    // Convert the loaded v16i8 vector to the appropriate vector type
    // specified by the operand, then pull the scalar out of the preferred
    // slot:
    MVT vecVT = MVT::getVectorVT(InVT, (128 / InVT.getSizeInBits()));
    result = DAG.getNode(SPUISD::VEC2PREFSLOT, InVT,
                         DAG.getNode(ISD::BIT_CONVERT, vecVT, result));

    // Handle extending loads by extending the scalar result:
    if (ExtType == ISD::SEXTLOAD) {
      result = DAG.getNode(ISD::SIGN_EXTEND, OutVT, result);
    } else if (ExtType == ISD::ZEXTLOAD) {
      result = DAG.getNode(ISD::ZERO_EXTEND, OutVT, result);
    } else if (ExtType == ISD::EXTLOAD) {
      unsigned NewOpc = ISD::ANY_EXTEND;

      if (OutVT.isFloatingPoint())
        NewOpc = ISD::FP_EXTEND;

      result = DAG.getNode(NewOpc, OutVT, result);
    }

    // Package the value together with the load's chain:
    SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
    SDValue retops[2] = {
      result,
      the_chain
    };

    result = DAG.getNode(SPUISD::LDRESULT, retvts,
                         retops, sizeof(retops) / sizeof(retops[0]));
    return result;
  }
  case ISD::PRE_INC:
  case ISD::PRE_DEC:
  case ISD::POST_INC:
  case ISD::POST_DEC:
  case ISD::LAST_INDEXED_MODE:
    cerr << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
            "UNINDEXED\n";
    cerr << (unsigned) LN->getAddressingMode() << "\n";
    abort();
    /*NOTREACHED*/
  }

  return SDValue();
}

/// Custom lower stores for CellSPU
/*!
 All CellSPU stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to generate a shuffle to insert the
 requested element into its place, then store the resulting block.

 \param Op The store node to lower (must be a StoreSDNode)
 \param DAG The selection DAG
 \param ST CellSPU subtarget information, forwarded to AlignedLoad
 \return The store of the merged 16-byte block. If AlignedLoad yields a null
 value, that value is returned unchanged. Addressing modes other than
 UNINDEXED print a diagnostic and abort.
 */
static SDValue
LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  StoreSDNode *SN = cast<StoreSDNode>(Op);
  SDValue theValue = SN->getValue();
  MVT VT = theValue.getValueType();
  MVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  unsigned alignment = SN->getAlignment();

  switch (SN->getAddressingMode()) {
  case ISD::UNINDEXED: {
    // All three are outputs of AlignedLoad; initialized defensively.
    int chunk_offset = 0, slot_offset = 0;
    bool was16aligned = false;

    // The vector type we really want to load from the 16-byte chunk.
    MVT vecVT = MVT::getVectorVT(VT, (128 / VT.getSizeInBits()));

    SDValue alignLoadVec =
      AlignedLoad(Op, DAG, ST, SN, alignment,
                  chunk_offset, slot_offset, VT, was16aligned);

    if (alignLoadVec.getNode() == 0)
      return alignLoadVec;

    LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec);
    SDValue basePtr = LN->getBasePtr();
    SDValue the_chain = alignLoadVec.getValue(1);
    SDValue result;

    if (StVT != VT
        && (theValue.getOpcode() == ISD::AssertZext
            || theValue.getOpcode() == ISD::AssertSext)) {
      // Drill down and get the value for zero- and sign-extended
      // quantities
      theValue = theValue.getOperand(0);
    }

    // Only the position inside the 16-byte chunk matters from here on.
    chunk_offset &= 0xf;

    SDValue insertEltOffs = DAG.getConstant(chunk_offset, PtrVT);
    SDValue insertEltPtr;

    // If the base pointer is already an indirect address (or an ADD whose
    // first operand is one), use it unchanged as the insertion pointer.
    // Otherwise form the insertion pointer by adding the slot offset to the
    // original base pointer.
    DEBUG(cerr << "CellSPU LowerSTORE: basePtr = ");
    DEBUG(basePtr.getNode()->dump(&DAG));
    DEBUG(cerr << "\n");

    if (basePtr.getOpcode() == SPUISD::IndirectAddr ||
        (basePtr.getOpcode() == ISD::ADD
         && basePtr.getOperand(0).getOpcode() == SPUISD::IndirectAddr)) {
      insertEltPtr = basePtr;
    } else {
      insertEltPtr = DAG.getNode(ISD::ADD, PtrVT, basePtr, insertEltOffs);
    }

    // Build the shuffle mask from the insertion pointer, and widen the scalar
    // to a vector so it can be shuffled into the loaded chunk:
    SDValue insertEltOp =
            DAG.getNode(SPUISD::SHUFFLE_MASK, vecVT, insertEltPtr);
    SDValue vectorizeOp =
            DAG.getNode(ISD::SCALAR_TO_VECTOR, vecVT, theValue);

    result = DAG.getNode(SPUISD::SHUFB, vecVT,
                         vectorizeOp, alignLoadVec,
                         DAG.getNode(ISD::BIT_CONVERT, MVT::v4i32, insertEltOp));

    // Write the merged chunk back to where the aligned load read it from:
    result = DAG.getStore(the_chain, result, basePtr,
                          LN->getSrcValue(), LN->getSrcValueOffset(),
                          LN->isVolatile(), LN->getAlignment());

#if 0 && defined(NDEBUG)
    if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
      const SDValue &currentRoot = DAG.getRoot();

      DAG.setRoot(result);
      cerr << "------- CellSPU:LowerStore result:\n";
      DAG.dump();
      cerr << "-------\n";
      DAG.setRoot(currentRoot);
    }
#endif

    return result;
    /*UNREACHED*/
  }
  case ISD::PRE_INC:
  case ISD::PRE_DEC:
  case ISD::POST_INC:
  case ISD::POST_DEC:
  case ISD::LAST_INDEXED_MODE:
    // This diagnostic used to name LowerLOAD/LoadSDNode (copy-paste slip).
    cerr << "LowerSTORE: Got a StoreSDNode with an addr mode other than "
            "UNINDEXED\n";
    cerr << (unsigned) SN->getAddressingMode() << "\n";
    abort();
    /*NOTREACHED*/
  }

  return SDValue();
}

/// Lower a ConstantPool node to the address of its constant pool entry.
/*!
 With the static relocation model this yields an A-form address when the
 subtarget is not using large memory, otherwise an indirect address built from
 Hi/Lo parts. Any other relocation model trips an assertion (and yields an
 empty SDValue when assertions are compiled out).
 */
static SDValue
LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  MVT AddrVT = Op.getValueType();
  ConstantPoolSDNode *PoolNode = cast<ConstantPoolSDNode>(Op);
  Constant *PoolConst = PoolNode->getConstVal();
  SDValue PoolAddr =
    DAG.getTargetConstantPool(PoolConst, AddrVT, PoolNode->getAlignment());
  SDValue ZeroOffs = DAG.getConstant(0, AddrVT);

  if (DAG.getTarget().getRelocationModel() != Reloc::Static) {
    assert(0 &&
           "LowerConstantPool: Relocation model other than static"
           " not supported.");
    return SDValue();
  }

  if (ST->usingLargeMem()) {
    // Large memory model: combine the high and low parts of the address.
    SDValue HiPart = DAG.getNode(SPUISD::Hi, AddrVT, PoolAddr, ZeroOffs);
    SDValue LoPart = DAG.getNode(SPUISD::Lo, AddrVT, PoolAddr, ZeroOffs);
    return DAG.getNode(SPUISD::IndirectAddr, AddrVT, HiPart, LoPart);
  }

  // Just return the SDValue with the constant pool address in it.
  return DAG.getNode(SPUISD::AFormAddr, AddrVT, PoolAddr, ZeroOffs);
}

/// Lower a JumpTable node to the address of the jump table.
/*!
 With the static relocation model this yields an A-form address when the
 subtarget is not using large memory, otherwise an indirect address built from
 Hi/Lo parts. Any other relocation model trips an assertion (and yields an
 empty SDValue when assertions are compiled out).
 */
static SDValue
LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  MVT AddrVT = Op.getValueType();
  JumpTableSDNode *TableNode = cast<JumpTableSDNode>(Op);
  SDValue TableAddr = DAG.getTargetJumpTable(TableNode->getIndex(), AddrVT);
  SDValue ZeroOffs = DAG.getConstant(0, AddrVT);

  if (DAG.getTarget().getRelocationModel() != Reloc::Static) {
    assert(0 &&
           "LowerJumpTable: Relocation model other than static not supported.");
    return SDValue();
  }

  if (ST->usingLargeMem()) {
    // Large memory model: combine the high and low parts of the address.
    SDValue HiPart = DAG.getNode(SPUISD::Hi, AddrVT, TableAddr, ZeroOffs);
    SDValue LoPart = DAG.getNode(SPUISD::Lo, AddrVT, TableAddr, ZeroOffs);
    return DAG.getNode(SPUISD::IndirectAddr, AddrVT, HiPart, LoPart);
  }

  return DAG.getNode(SPUISD::AFormAddr, AddrVT, TableAddr, ZeroOffs);
}

/// Generate the address of a global value.
///
/// Only the static relocation model is handled; any other model writes a
/// diagnostic to cerr and aborts. When the subtarget is not using large
/// memory the target global address is wrapped in SPUISD::AFormAddr,
/// otherwise it is formed from SPUISD::Hi and SPUISD::Lo nodes combined by
/// SPUISD::IndirectAddr.
///
/// \param Op  The node being lowered; it is cast to GlobalAddressSDNode.
/// \param DAG The selection DAG in which the replacement nodes are created.
/// \param ST  Subtarget, queried via usingLargeMem().
/// \return The node computing the global's address.
static SDValue
LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  MVT AddrVT = Op.getValueType();
  GlobalAddressSDNode *GlobalNode = cast<GlobalAddressSDNode>(Op);
  GlobalValue *Global = GlobalNode->getGlobal();
  SDValue TargetAddr =
    DAG.getTargetGlobalAddress(Global, AddrVT, GlobalNode->getOffset());
  SDValue ZeroOffset = DAG.getConstant(0, AddrVT);

  // Reject unsupported relocation models up front.
  if (DAG.getTarget().getRelocationModel() != Reloc::Static) {
    cerr << "LowerGlobalAddress: Relocation model other than static not "
         << "supported.\n";
    abort();
    /*NOTREACHED*/
  }

  if (ST->usingLargeMem()) {
    // Large memory: build the address from separate high and low parts.
    SDValue HiPart = DAG.getNode(SPUISD::Hi, AddrVT, TargetAddr, ZeroOffset);
    SDValue LoPart = DAG.getNode(SPUISD::Lo, AddrVT, TargetAddr, ZeroOffset);
    return DAG.getNode(SPUISD::IndirectAddr, AddrVT, HiPart, LoPart);
  }

  return DAG.getNode(SPUISD::AFormAddr, AddrVT, TargetAddr, ZeroOffset);
}

//! Custom lower i64 integer constants
/*!
 Emits the nodes needed to get a 64-bit constant into a register: the constant
 is placed in both elements of a v2i64 BUILD_VECTOR, and that vector is fed to
 a SPUISD::VEC2PREFSLOT node whose result type is i64.

 Any value type other than MVT::i64 is reported on cerr and aborts.
 */
static SDValue
LowerConstant(SDValue Op, SelectionDAG &DAG) {
  MVT ConstVT = Op.getValueType();

  // Only i64 constants are custom lowered here.
  if (!(ConstVT == MVT::i64)) {
    cerr << "LowerConstant: unhandled constant type "
         << ConstVT.getMVTString()
         << "\n";
    abort();
    /*NOTREACHED*/
  }

  ConstantSDNode *ConstNode = cast<ConstantSDNode>(Op.getNode());
  SDValue Scalar = DAG.getConstant(ConstNode->getZExtValue(), ConstVT);
  SDValue Splat = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, Scalar, Scalar);
  return DAG.getNode(SPUISD::VEC2PREFSLOT, ConstVT, Splat);
}

//! Custom lower double precision floating point constants
/*!
 For an MVT::f64 constant, the double's bit pattern is materialized as an i64
 constant, placed in both elements of a v2i64 BUILD_VECTOR, bit-converted to
 v2f64 and fed to a SPUISD::VEC2PREFSLOT node whose result type is f64.

 \param Op  The node being lowered; for f64 it is cast to ConstantFPSDNode.
 \param DAG The selection DAG in which the replacement nodes are created.
 \return The lowered value for f64, or an empty SDValue for any other value
         type.
 */
static SDValue
LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getValueType();

  // Nothing to do for anything other than f64.
  if (!(VT == MVT::f64))
    return SDValue();

  // cast<> already asserts that the node really is a ConstantFPSDNode and
  // never produces a null pointer, so no separate null check is needed.
  ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());

  uint64_t dbits = DoubleToBits(FP->getValueAPF().convertToDouble());
  SDValue T = DAG.getConstant(dbits, MVT::i64);
  SDValue Tvec = DAG.getNode(ISD::BUILD_VECTOR, MVT::v2i64, T, T);
  SDValue FPvec = DAG.getNode(ISD::BIT_CONVERT, MVT::v2f64, Tvec);
  return DAG.getNode(SPUISD::VEC2PREFSLOT, VT, FPvec);
}

//! Lower an MVT::i8 brcond condition to a promoted type (MVT::i16)
/*!
 When the condition (operand 1) has type MVT::i8, a new BRCOND is built with
 the same operands 0 and 2 and the condition zero-extended to MVT::i16. For
 any other condition type an empty SDValue is returned, leaving the node
 unchanged.
 */
static SDValue
LowerBRCOND(SDValue Op, SelectionDAG &DAG)
{
  SDValue Condition = Op.getOperand(1);

  if (!(Condition.getValueType() == MVT::i8))
    return SDValue();                // Unchanged

  SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, MVT::i16, Condition);
  return DAG.getNode(ISD::BRCOND, Op.getValueType(),
                     Op.getOperand(0),
                     Widened,
                     Op.getOperand(2));
}

static SDValue
LowerFORMAL_ARGUMENTS(SDValue Op, SelectionDAG &DAG, int &VarArgsFrameIndex)
{
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MachineRegisterInfo &RegInfo = MF.getRegInfo();
  SmallVector<SDValue, 48> ArgValues;
  SDValue Root = Op.getOperand(0);
  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue() != 0;

  const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
  const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();

  unsigned ArgOffset = SPUFrameInfo::minStackSize();
  unsigned ArgRegIdx = 0;
  unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();

  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Add DAG nodes to load the arguments or copy them out of registers.
  for (unsigned ArgNo = 0, e = Op.getNode()->getNumValues() - 1;
       ArgNo != e; ++ArgNo) {
    MVT ObjectVT = Op.getValue(ArgNo).getValueType();
    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
    SDValue ArgVal;

    if (ArgRegIdx < NumArgRegs) {
      const TargetRegisterClass *ArgRegClass;

      switch (ObjectVT.getSimpleVT()) {
      default: {
        cerr << "LowerFORMAL_ARGUMENTS Unhandled argument type: "
             << ObjectVT.getMVTString()
             << "\n";
        abort();
      }
      case MVT::i8:
        ArgRegClass = &SPU::R8CRegClass;
        break;
      case MVT::i16:
        ArgRegClass = &SPU::R16CRegClass;
        break;
      case MVT::i32:
        ArgRegClass = &SPU::R32CRegClass;
        break;
      case MVT::i64:
        ArgRegClass = &SPU::R64CRegClass;
        break;
      case MVT::f32:
        ArgRegClass = &SPU::R32FPRegClass;
        break;
      case MVT::f64:
        ArgRegClass = &SPU::R64FPRegClass;
        break;
      case MVT::v2f64:
      case MVT::v4f32:
      case MVT::v2i64:
      case MVT::v4i32:
      case MVT::v8i16:
      case MVT::v16i8:
        ArgRegClass = &SPU::VECREGRegClass;