//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file describes the X86 SSE instruction set, defining the instructions,
// and properties of the instructions which are needed for code generation,
// machine code emission, and analysis.
//
//===----------------------------------------------------------------------===//

class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
}

class SizeItins<OpndItins arg_s, OpndItins arg_d> {
  OpndItins s = arg_s;
  OpndItins d = arg_d;
}


class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
  InstrItinClass arg_ri> {
  InstrItinClass rr = arg_rr;
  InstrItinClass rm = arg_rm;
  InstrItinClass ri = arg_ri;
}
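
// An illustrative (hypothetical) instantiation and use of these bundles:
//
//   def MY_SHIFT_ITINS : ShiftOpndItins<
//     IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
//   >;
//
// An instruction defined against such a bundle then selects .rr, .rm, or .ri
// according to its operand form, as the multiclasses below do with
// itins.rr / itins.rm.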


// scalar
def SSE_ALU_F32S : OpndItins<
  IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
>;

def SSE_ALU_F64S : OpndItins<
  IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
>;

def SSE_ALU_ITINS_S : SizeItins<
  SSE_ALU_F32S, SSE_ALU_F64S
>;

def SSE_MUL_F32S : OpndItins<
  IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F32S_RM
>;

def SSE_MUL_F64S : OpndItins<
  IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
>;

def SSE_MUL_ITINS_S : SizeItins<
  SSE_MUL_F32S, SSE_MUL_F64S
>;

def SSE_DIV_F32S : OpndItins<
  IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F32S_RM
>;

def SSE_DIV_F64S : OpndItins<
  IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
>;

def SSE_DIV_ITINS_S : SizeItins<
  SSE_DIV_F32S, SSE_DIV_F64S
>;

// parallel
def SSE_ALU_F32P : OpndItins<
  IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
>;

def SSE_ALU_F64P : OpndItins<
  IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
>;

def SSE_ALU_ITINS_P : SizeItins<
  SSE_ALU_F32P, SSE_ALU_F64P
>;

def SSE_MUL_F32P : OpndItins<
  IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F32P_RM
>;

def SSE_MUL_F64P : OpndItins<
  IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
>;

def SSE_MUL_ITINS_P : SizeItins<
  SSE_MUL_F32P, SSE_MUL_F64P
>;

def SSE_DIV_F32P : OpndItins<
  IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F32P_RM
>;

def SSE_DIV_F64P : OpndItins<
  IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
>;

def SSE_DIV_ITINS_P : SizeItins<
  SSE_DIV_F32P, SSE_DIV_F64P
>;

def SSE_BIT_ITINS_P : OpndItins<
  IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;

def SSE_INTALU_ITINS_P : OpndItins<
  IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;

def SSE_INTALUQ_ITINS_P : OpndItins<
  IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
>;

def SSE_INTMUL_ITINS_P : OpndItins<
  IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;

def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
  IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;

def SSE_MOVA_ITINS : OpndItins<
  IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;

def SSE_MOVU_ITINS : OpndItins<
  IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 Instructions Classes
//===----------------------------------------------------------------------===//

/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, X86MemOperand x86memop,
                           OpndItins itins,
                           bit Is2Addr = 1> {
  let isCommutable = 1 in {
    def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr>;
  }
  def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm>;
}
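
// A sketch of a direct instantiation (illustrative only; the in-tree uses go
// through the basic_sse12_fp_binop_s helpers defined later in this file):
//
//   defm MYADD : sse12_fp_scalar<0x58, "addss", fadd, FR32, f32mem,
//                                SSE_ALU_F32S>, XS;
//
// With Is2Addr left at its default of 1, this produces MYADDrr and MYADDrm
// using the two-operand "$src2, $dst" asm string.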

/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                             string asm, string SSEVer, string FPSizeStr,
                             Operand memopr, ComplexPattern mem_cpat,
                             OpndItins itins,
                             bit Is2Addr = 1> {
  def rr_Int : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                 !strconcat("int_x86_sse", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, RC:$src2))], itins.rr>;
  def rm_Int : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(!strconcat("int_x86_sse",
                                          SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, mem_cpat:$src2))], itins.rm>;
}
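
// The intrinsic record is assembled by name at TableGen time. For example,
// with the hypothetical parameter values SSEVer = "2", OpcodeStr = "add" and
// FPSizeStr = "_sd", the !strconcat above resolves to "int_x86_sse2_add_sd",
// i.e. the record for llvm.x86.sse2.add.sd, which then becomes the operation
// of the selection pattern.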

/// sse12_fp_packed - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           RegisterClass RC, ValueType vt,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  let isCommutable = 1 in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>;
    def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
          itins.rm, d>;
}
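
// The Domain operand (SSEPackedSingle / SSEPackedDouble) recorded by PI is
// what later allows ExecutionDepsFix to rewrite an instruction into an
// equivalent one in a cheaper execution domain. An illustrative, not in-tree,
// instantiation:
//
//   defm MYADDP : sse12_fp_packed<0x58, "addps", fadd, VR128, v4f32, f128mem,
//                                 memopv4f32, SSEPackedSingle, SSE_ALU_F32P>;
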
/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
                                      string OpcodeStr, X86MemOperand x86memop,
                                      list<dag> pat_rr, list<dag> pat_rm,
                                      bit Is2Addr = 1,
                                      bit rr_hasSideEffects = 0> {
  let isCommutable = 1, neverHasSideEffects = rr_hasSideEffects in
    def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rr, IIC_DEFAULT, d>;
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
       !if(Is2Addr,
           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       pat_rm, IIC_DEFAULT, d>;
}

/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class
multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
                           string asm, string SSEVer, string FPSizeStr,
                           X86MemOperand x86memop, PatFrag mem_frag,
                           Domain d, OpndItins itins, bit Is2Addr = 1> {
  def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
           [(set RC:$dst, (!cast<Intrinsic>(
                     !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
                 RC:$src1, RC:$src2))], IIC_DEFAULT, d>;
  def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2),
       !if(Is2Addr,
           !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
           !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
       [(set RC:$dst, (!cast<Intrinsic>(
                     !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
             RC:$src1, (mem_frag addr:$src2)))], IIC_DEFAULT, d>;
}

//===----------------------------------------------------------------------===//
// Non-instruction patterns
//===----------------------------------------------------------------------===//

// A vector extract of the first f32/f64 position is a subregister copy
def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
          (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
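
// For example, IR such as "extractelement <4 x float> %v, i32 0" selects via
// the patterns above to a plain VR128->FR32 register-class copy, so no
// instruction is emitted for reading the low lane.
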
// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
          (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
          (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
          (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
          (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
          (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
          (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;

// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
let AddedComplexity = 25 in { // to give priority over vinsertf128rm
def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
          (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
}

// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;
def : Pat<(v8f32 (scalar_to_vector FR32:$src)),
          (COPY_TO_REGCLASS FR32:$src, VR128)>;

// Implicitly promote a 64-bit scalar to a vector.
def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;
def : Pat<(v4f64 (scalar_to_vector FR64:$src)),
          (COPY_TO_REGCLASS FR64:$src, VR128)>;

// Bitcasts between 128-bit vector types. Return the original type since
// no instruction is needed for the conversion
let Predicates = [HasSSE2] in {
  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
}
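
// For example, (v4i32 (bitconvert (v2f64 VR128:$src))) selects to the same
// register with the new type; the bits held in the XMM register are simply
// reinterpreted, so no instruction is emitted.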

// Bitcasts between 256-bit vector types. Return the original type since
// no instruction is needed for the conversion
let Predicates = [HasAVX] in {
  def : Pat<(v4f64  (bitconvert (v8f32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v8i32 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v4i64 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
  def : Pat<(v4f64  (bitconvert (v32i8 VR256:$src))),  (v4f64 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v8i32 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4i64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v4f64 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v32i8 VR256:$src))),  (v8f32 VR256:$src)>;
  def : Pat<(v8f32  (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8f32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v8i32 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v4f64 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v32i8 VR256:$src))),  (v4i64 VR256:$src)>;
  def : Pat<(v4i64  (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4f64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v4i64 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8f32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v8i32 VR256:$src))),  (v32i8 VR256:$src)>;
  def : Pat<(v32i8  (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v32i8 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v8f32 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4i64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v8i32  (bitconvert (v4f64 VR256:$src))),  (v8i32 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))),  (v16i16 VR256:$src)>;
  def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))),  (v16i16 VR256:$src)>;
}

// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1 in {
  def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
  def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
}

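// As a concrete example of the expansion mentioned above: an FsFLD0SS whose
// result is assigned xmm2 becomes "xorps %xmm2, %xmm2" (or "vxorps" when AVX
// is available) once ExpandPostRAPseudos runs.
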
//===----------------------------------------------------------------------===//
// AVX & SSE - Zero/One Vectors
//===----------------------------------------------------------------------===//

// Alias instruction that maps zero vector to pxor / xorp* for sse.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
// swizzled by ExecutionDepsFix to pxor.
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1 in {
def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
               [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}

def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;


// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on Sandy Bridge the register is set to zero
// at the rename stage without using any execution unit, so AVX_SET0 can be
// used for vector int instructions without penalty.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1, Predicates = [HasAVX] in {
def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                 [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}

let Predicates = [HasAVX] in {
  def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
  def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}

// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;

def : Pat<(v16i16 immAllZerosV), (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v16i16 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i16 0), (V_SET0), sub_xmm)>;

def : Pat<(v8i32 immAllZerosV), (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i32 0), (V_SET0), sub_xmm)>;
def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
          (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
}

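// In other words, under AVX1-only a 256-bit integer zero such as
// (v8i32 immAllZerosV) is built from the 128-bit V_SET0 (an xorps of the low
// lane), relying on VEX.128 instructions zeroing bits 255:128 of the ymm
// register; SUBREG_TO_REG merely records that the upper half is known zero.
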
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
    isPseudo = 1 in {
  def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
                       [(set VR128:$dst, (v4i32 immAllOnesV))]>;
  let Predicates = [HasAVX2] in
  def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
                          [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move FP Scalar Instructions
//
// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
// register copies because it's a partial register update; FsMOVAPSrr/FsMOVAPDrr
// is used instead. Register-to-register movss/movsd is not modeled as an
// INSERT_SUBREG because INSERT_SUBREG requires that the insert be implementable
// in terms of a copy, and, as just mentioned, we don't use movss/movsd for
// copies.
//===----------------------------------------------------------------------===//
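
// As a concrete example of the partial-register-update problem mentioned
// above: "movss %xmm1, %xmm0" writes only XMM0[31:0] and leaves XMM0[127:32]
// unchanged, creating a false dependence on the prior value of %xmm0, while
// "movaps %xmm1, %xmm0" rewrites the whole register and carries no such
// dependence.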

class sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, string asm> :
      SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm,
      [(set VR128:$dst, (vt (OpNode VR128:$src1,
                             (scalar_to_vector RC:$src2))))],
      IIC_SSE_MOV_S_RR>;

// Loading from memory automatically zeroing upper bits.
class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
                    PatFrag mem_pat, string OpcodeStr> :
      SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
         !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                        [(set RC:$dst, (mem_pat addr:$src))],
                        IIC_SSE_MOV_S_RM>;
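
// For example, the memory form "movss (%rax), %xmm0" writes the loaded value
// to XMM0[31:0] and zeros XMM0[127:32], which is why this class can define a
// plain FR32/FR64 result without merging with a source register.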

// AVX
def VMOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
                "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V,
                VEX_LIG;
def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
                "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V,
                VEX_LIG;
// For the disassembler
let isCodeGenOnly = 1 in {
  def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                        (ins VR128:$src1, FR32:$src2),
                        "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                        IIC_SSE_MOV_S_RR>,
                        XS, VEX_4V, VEX_LIG;
  def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                        (ins VR128:$src1, FR64:$src2),
                        "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                        IIC_SSE_MOV_S_RR>,
                        XD, VEX_4V, VEX_LIG;
}

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX,
                 VEX_LIG;
  let AddedComplexity = 20 in
    def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX,
                   VEX_LIG;
}

def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                  "movss\t{$src, $dst|$dst, $src}",
                  [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                  XS, VEX, VEX_LIG;
def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                  "movsd\t{$src, $dst|$dst, $src}",
                  [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
                  XD, VEX, VEX_LIG;

// SSE1 & 2
let Constraints = "$src1 = $dst" in {
  def MOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
                          "movss\t{$src2, $dst|$dst, $src2}">, XS;
  def MOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
                          "movsd\t{$src2, $dst|$dst, $src2}">, XD;

  // For the disassembler
  let isCodeGenOnly = 1 in {
    def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                         (ins VR128:$src1, FR32:$src2),
                         "movss\t{$src2, $dst|$dst, $src2}", [],
                         IIC_SSE_MOV_S_RR>, XS;
    def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
                         (ins VR128:$src1, FR64:$src2),
                         "movsd\t{$src2, $dst|$dst, $src2}", [],
                         IIC_SSE_MOV_S_RR>, XD;
  }
}

let canFoldAsLoad = 1, isReMaterializable = 1 in {
  def MOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;

  let AddedComplexity = 20 in
    def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}

def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
                  "movss\t{$src, $dst|$dst, $src}",
                  [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
                  "movsd\t{$src, $dst|$dst, $src}",
                  [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;

// Patterns
let Predicates = [HasAVX] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVS{S,D} to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (VMOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (VMOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;

  // Move low f32 and clear high bits.
  def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSSrr (v4f32 (V_SET0)),
                       (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm)), sub_xmm)>;
  def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSSrr (v4i32 (V_SET0)),
                       (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)), sub_xmm)>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
  // MOVSDrm zeros the high parts of the register; represent this
  // with SUBREG_TO_REG. The AVX versions also write: DST[255:128] <- 0
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

  // Represent the same patterns above but in the form they appear for
  // 256-bit types
  def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
                   (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
  def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
                   (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
                           sub_xmm)>;
  def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
                   (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
            (SUBREG_TO_REG (i32 0),
                           (v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
                           sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
                   (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
            (SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
  }

  // Move low f64 and clear high bits.
  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSDrr (v2f64 (V_SET0)),
                       (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)), sub_xmm)>;
  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
            (SUBREG_TO_REG (i32 0),
             (VMOVSDrr (v2i64 (V_SET0)),
                       (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)), sub_xmm)>;
  // Extract and store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (VMOVSDmr addr:$dst, (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64))>;

  // Shuffle with VMOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4i32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (VMOVSSrr (v4f32 VR128:$src1),
                      (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

  // 256-bit variants
  def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // Shuffle with VMOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // 256-bit variants
  def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
              sub_xmm)>;
  def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
            (SUBREG_TO_REG (i32 0),
              (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
                        (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
              sub_xmm)>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}

let Predicates = [UseSSE1] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSS to the lower bits.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
            (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
  def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
            (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
            (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
  }

  let AddedComplexity = 20 in {
  // MOVSSrm already zeros the high parts of the register.
  def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
            (COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
  }

  // Extract and store.
  def : Pat<(store (f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

  // Shuffle with MOVSS
  def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
  def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
            (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}

let Predicates = [UseSSE2] in {
  let AddedComplexity = 15 in {
  // Move scalar to XMM zero-extended, zeroing a VR128 then do a
  // MOVSD to the lower bits.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
            (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
  }

  let AddedComplexity = 20 in {
  // MOVSDrm already zeros the high parts of the register.
  def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  def : Pat<(v2f64 (X86vzload addr:$src)),
            (COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
  }

  // Extract and store.
  def : Pat<(store (f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
                   addr:$dst),
            (MOVSDmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR64))>;

  // Shuffle with MOVSD
  def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

  // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
  // is during lowering, where it's not possible to recognize the fold because
  // it has two uses through a bitcast. One use disappears at isel time and the
  // fold opportunity reappears.
  def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
  def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}

//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
//===----------------------------------------------------------------------===//

multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
                            X86MemOperand x86memop, PatFrag ld_frag,
                            string asm, Domain d,
                            OpndItins itins,
                            bit IsReMaterializable = 1> {
let neverHasSideEffects = 1 in
  def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>;
let canFoldAsLoad = 1, isReMaterializable = IsReMaterializable in
  def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
              !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
                   [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>;
}

defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB, VEX;
defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize, VEX;
defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB, VEX;
defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize, VEX;

defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB, VEX;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize, VEX;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB, VEX;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize, VEX;

defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                              "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
                              TB;
defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                              "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
                              TB, OpSize;
defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                              "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
                              TB;
defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                              "movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
                              TB, OpSize;
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>, VEX;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v8f32 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v4f64 VR256:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>, VEX;

// For the disassembler
let isCodeGenOnly = 1 in {
  def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                          (ins VR128:$src),
                          "movaps\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movapd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movups\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movupd\t{$src, $dst|$dst, $src}", [],
                           IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movaps\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVA_P_RR>, VEX;
  def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX;
  def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                            (ins VR256:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
                            IIC_SSE_MOVU_P_RR>, VEX;
}

let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
                  (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
                  (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
                  (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
                  (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
          (SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}


def : Pat<(int_x86_avx_storeu_ps_256 addr:$dst, VR256:$src),
          (VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(int_x86_avx_storeu_pd_256 addr:$dst, VR256:$src),
          (VMOVUPDYmr addr:$dst, VR256:$src)>;

def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movaps\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movapd\t{$src, $dst|$dst, $src}",
                   [(alignedstore (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVA_P_MR>;
def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movups\t{$src, $dst|$dst, $src}",
                   [(store (v4f32 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;
def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                   "movupd\t{$src, $dst|$dst, $src}",
                   [(store (v2f64 VR128:$src), addr:$dst)],
                   IIC_SSE_MOVU_P_MR>;

// For the disassembler
let isCodeGenOnly = 1 in {
  def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movaps\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movapd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>;
  def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movups\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
  def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movupd\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>;
}

let Predicates = [HasAVX] in {
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (VMOVUPDmr addr:$dst, VR128:$src)>;
}

let Predicates = [UseSSE1] in
  def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
            (MOVUPSmr addr:$dst, VR128:$src)>;
let Predicates = [UseSSE2] in
  def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
            (MOVUPDmr addr:$dst, VR128:$src)>;

// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX] in {
  // 128-bit load/store
  def : Pat<(alignedloadv2i64 addr:$src),
            (VMOVAPSrm addr:$src)>;
  def : Pat<(loadv2i64 addr:$src),
            (VMOVUPSrm addr:$src)>;

  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
            (VMOVAPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
            (VMOVUPSmr addr:$dst, VR128:$src)>;
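
// For example, an aligned (v4i32 ...) load under AVX selects to VMOVAPSrm,
// i.e. "vmovaps", rather than "vmovdqa"; for a bare load or store the two
// are bitwise-equivalent moves, so the FP opcode can stand in for the
// integer one.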