// X86InstrSSE.td (excerpt) -- web-viewer navigation text removed.
// Load-folding movlp patterns, given AddedComplexity = 20.
// NOTE(review): the closing brace of this `let` was absent from the text as
// received. It is restored after the four load patterns on the assumption
// that the scope ends with this comment-delimited group -- confirm against
// upstream.
let AddedComplexity = 20 in {
// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
          (MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
          (MOVLPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
          (MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
          (MOVLPDrm VR128:$src1, addr:$src2)>;
}
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
//
// Every pattern below requires the movlp's first operand to be loaded from
// the same address ($src1) that the result is stored back to, and selects the
// memory-destination form of the instruction.
def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)),
                 addr:$src1),
          (MOVLPSmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)),
                 addr:$src1),
          (MOVLPDmr addr:$src1, VR128:$src2)>;
// The v4i32 form matches its load as a v2i64 load bitcast to v4i32.
def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)),
                               VR128:$src2)),
                 addr:$src1),
          (MOVLPSmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)),
                 addr:$src1),
          (MOVLPDmr addr:$src1, VR128:$src2)>;
// Setting the lowest element in the vector.
// The second source is narrowed with EXTRACT_SUBREG (sub_ss / sub_sd) before
// being handed to MOVSSrr / MOVSDrr.
def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
          (MOVSSrr (v4i32 VR128:$src1),
                   (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_ss))>;
def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
          (MOVSDrr (v2i64 VR128:$src1),
                   (EXTRACT_SUBREG (v2i64 VR128:$src2), sub_sd))>;
// vector_shuffle v1, v2 <4, 5, 2, 3> using movsd
// NOTE(review): the SHUFPSrri fallback that follows matches the same v4f32
// movlp shape; confirm that these patterns are actually prioritised over it
// (e.g. through AddedComplexity), as the fallback's comment intends.
def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
      Requires<[HasSSE2]>;
def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
          (MOVSDrr VR128:$src1, (EXTRACT_SUBREG VR128:$src2, sub_sd))>,
      Requires<[HasSSE2]>;
// vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
// fall back to this for SSE1)
//
// The shuffle node is bound to $src3 and passed through SHUFFLE_get_shuf_imm
// to form the third operand. Note that SHUFPSrri receives the two vector
// operands in swapped order ($src2 first).
def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
          (SHUFPSrri VR128:$src2,
                     VR128:$src1,
                     (SHUFFLE_get_shuf_imm VR128:$src3))>;
// Set lowest element and zero upper elements.
// Register-to-register v2f64 X86vzmovl is selected to MOVZPQILo2PQIrr.
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
          (MOVZPQILo2PQIrr VR128:$src)>,
      Requires<[HasSSE2]>;
// Some special case pandn patterns.
//
// and(xor(x, all-ones), y) is selected to PANDNrr. One pattern is written per
// element type in which the all-ones vector may be expressed (v4i32, v8i16,
// v16i8), each bitcast to v2i64.
def : Pat<(v2i64 (and (xor VR128:$src1,
                           (bc_v2i64 (v4i32 immAllOnesV))),
                      VR128:$src2)),
          (PANDNrr VR128:$src1, VR128:$src2)>,
      Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1,
                           (bc_v2i64 (v8i16 immAllOnesV))),
                      VR128:$src2)),
          (PANDNrr VR128:$src1, VR128:$src2)>,
      Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1,
                           (bc_v2i64 (v16i8 immAllOnesV))),
                      VR128:$src2)),
          (PANDNrr VR128:$src1, VR128:$src2)>,
      Requires<[HasSSE2]>;

// Memory-operand forms of the pandn patterns: the second `and` operand is a
// memop load that is folded into PANDNrm.
// NOTE(review): the `(memop addr:$src2))),` operand line of each pattern was
// missing from the text as received (unbalanced parentheses, $src2 never
// bound). It is restored here by analogy with the register forms -- confirm
// against upstream.
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
                  (memop addr:$src2))),
          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
                  (memop addr:$src2))),
          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
                  (memop addr:$src2))),
          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
// vector -> vector casts
//
// Signed-integer <-> floating-point vector conversions are selected to the
// Int_CVT* instruction records. The v2i32 side of the last two patterns lives
// in VR64.
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
          (Int_CVTDQ2PSrr VR128:$src)>,
      Requires<[HasSSE2]>;
def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
          (Int_CVTTPS2DQrr VR128:$src)>,
      Requires<[HasSSE2]>;
def : Pat<(v2f64 (sint_to_fp (v2i32 VR64:$src))),
          (Int_CVTPI2PDrr VR64:$src)>,
      Requires<[HasSSE2]>;
def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))),
          (Int_CVTTPD2PIrr VR128:$src)>,
      Requires<[HasSSE2]>;
// Use movaps / movups for SSE integer load / store (one byte shorter).
// Aligned loads select MOVAPSrm; the plain load fragments select MOVUPSrm.
def : Pat<(alignedloadv4i32 addr:$src),
          (MOVAPSrm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
          (MOVUPSrm addr:$src)>;
def : Pat<(alignedloadv2i64 addr:$src),
          (MOVAPSrm addr:$src)>;
// NOTE(review): the header of this pattern was missing from the text as
// received (only its result line survived). `loadv2i64` is restored by
// analogy with the v4i32 pair above -- confirm against upstream.
def : Pat<(loadv2i64 addr:$src),
          (MOVUPSrm addr:$src)>;

// Integer vector stores: alignedstore selects MOVAPSmr, plain store selects
// MOVUPSmr, for each of the v2i64 / v4i32 / v8i16 / v16i8 value types.
def : Pat<
  (alignedstore (v2i64 VR128:$src), addr:$dst),
  (MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<
  (alignedstore (v4i32 VR128:$src), addr:$dst),
  (MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<
  (alignedstore (v8i16 VR128:$src), addr:$dst),
  (MOVAPSmr addr:$dst, VR128:$src)>;
def : Pat<
  (alignedstore (v16i8 VR128:$src), addr:$dst),
  (MOVAPSmr addr:$dst, VR128:$src)>;

def : Pat<
  (store (v2i64 VR128:$src), addr:$dst),
  (MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<
  (store (v4i32 VR128:$src), addr:$dst),
  (MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<
  (store (v8i16 VR128:$src), addr:$dst),
  (MOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<
  (store (v16i8 VR128:$src), addr:$dst),
  (MOVUPSmr addr:$dst, VR128:$src)>;
//===----------------------------------------------------------------------===//
// SSE4.1 Instructions
//===----------------------------------------------------------------------===//

// sse41_fp_unop_rm - Packed fp unary operation taking an 8-bit immediate.
// Defines register and memory intrinsic forms for the "ps" variant
// (opcps / V4F32Int) and the "pd" variant (opcpd / V2F64Int) of OpcodeStr.
//
// NOTE(review): in the text as received, the `def PSm_Int : ...` header line,
// the `OpSize;` terminators of PSm_Int and PDm_Int, and the closing brace of
// the multiclass were missing. They are restored here so that PSm_Int mirrors
// the intact PDm_Int -- confirm the PSm_Int base class and suffixes against
// upstream.
multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
                            string OpcodeStr,
                            Intrinsic V4F32Int,
                            Intrinsic V2F64Int> {
  // Vector intrinsic operation, reg
  def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PSm_Int : SS4AIi8<opcps, MRMSrcMem,
                    (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst,
                          (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, reg
  def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>,
                    OpSize;

  // Vector intrinsic operation, mem
  def PDm_Int : SS4AIi8<opcpd, MRMSrcMem,
                    (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
                    !strconcat(OpcodeStr,
                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                    [(set VR128:$dst,
                          (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>,
                    OpSize;
}
// sse41_fp_binop_rm - Two-address ($src1 tied to $dst) fp intrinsic operation
// with an 8-bit immediate. Defines register and memory forms for the "ss"
// variant (opcss / F32Int) and the "sd" variant (opcsd / F64Int). The
// assembly strings print only $src3, $src2 and $dst.
let Constraints = "$src1 = $dst" in {
multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, string OpcodeStr,
                             Intrinsic F32Int, Intrinsic F64Int> {
  // "ss" variant, register source.
  def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
                        !strconcat(OpcodeStr,
                          "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        [(set VR128:$dst,
                           (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
                OpSize;

  // "ss" variant, memory source (ssmem operand matched via sse_load_f32).
  def SSm_Int : SS4AIi8<opcss, MRMSrcMem, (outs VR128:$dst),
                        (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
                        !strconcat(OpcodeStr,
                          "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        [(set VR128:$dst,
                           (F32Int VR128:$src1, sse_load_f32:$src2,
                                   imm:$src3))]>,
                OpSize;

  // "sd" variant, register source.
  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst),
                        (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
                        !strconcat(OpcodeStr,
                          "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        [(set VR128:$dst,
                           (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
                OpSize;

  // "sd" variant, memory source (sdmem operand matched via sse_load_f64).
  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, (outs VR128:$dst),
                        (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
                        !strconcat(OpcodeStr,
                          "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                        [(set VR128:$dst,
                           (F64Int VR128:$src1, sse_load_f64:$src2,
                                   imm:$src3))]>,
                OpSize;
}
}

// FP round - roundss, roundps, roundsd, roundpd
// Both defms share the ROUND prefix; the per-record suffixes of the two
// multiclasses (PS*/PD* versus SS*/SD*) keep the generated names distinct.
defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", int_x86_sse41_round_ps,
                              int_x86_sse41_round_pd>;
defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round", int_x86_sse41_round_ss,
                               int_x86_sse41_round_sd>;

// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128> {
  // Register source.
  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
              OpSize;
  // Memory source: a v8i16 memop, bitconverted for the intrinsic.
  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                    [(set VR128:$dst,
                       (IntId128 (bitconvert (memopv8i16 addr:$src))))]>,
              OpSize;
}

defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
                                        int_x86_sse41_phminposuw>;

/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
///
/// Two-address ($src1 tied to $dst) intrinsic-based binary operation with a
/// register form (rr) and a memory form (rm). Only rr receives the
/// Commutable flag.
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId128, bit Commutable = 0> {
    def rr : SS48I<opc, MRMSrcReg,
                   (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                      (IntId128 VR128:$src1, VR128:$src2))]>,
             OpSize {
      let isCommutable = Commutable;
    }
    // The memory operand is matched as a v16i8 memop and bitconverted.
    def rm : SS48I<opc, MRMSrcMem,
                   (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                      (IntId128 VR128:$src1,
                                (bitconvert (memopv16i8 addr:$src2))))]>,
             OpSize;
  }
}

// Instantiations: <opcode, mnemonic, intrinsic, commutable>.
defm PCMPEQQ  : SS41I_binop_rm_int<0x29, "pcmpeqq", int_x86_sse41_pcmpeqq, 1>;
defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw, 0>;
defm PMINSB   : SS41I_binop_rm_int<0x38, "pminsb", int_x86_sse41_pminsb, 1>;
defm PMINSD   : SS41I_binop_rm_int<0x39, "pminsd", int_x86_sse41_pminsd, 1>;
defm PMINUD   : SS41I_binop_rm_int<0x3B, "pminud", int_x86_sse41_pminud, 1>;
defm PMINUW   : SS41I_binop_rm_int<0x3A, "pminuw", int_x86_sse41_pminuw, 1>;
defm PMAXSB   : SS41I_binop_rm_int<0x3C, "pmaxsb", int_x86_sse41_pmaxsb, 1>;
defm PMAXSD   : SS41I_binop_rm_int<0x3D, "pmaxsd", int_x86_sse41_pmaxsd, 1>;
defm PMAXUD   : SS41I_binop_rm_int<0x3F, "pmaxud", int_x86_sse41_pmaxud, 1>;
defm PMAXUW   : SS41I_binop_rm_int<0x3E, "pmaxuw", int_x86_sse41_pmaxuw, 1>;
defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, 1>;

// Select the X86pcmpeqq node (register and memop forms) to the PCMPEQQ
// instructions defined above.
def : Pat<
  (v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
  (PCMPEQQrr VR128:$src1, VR128:$src2)>;
def : Pat<
  (v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
  (PCMPEQQrm VR128:$src1, addr:$src2)>;

/// SS41I_binop_patint - Simple SSE 4.1 binary operator defined both through
/// an SDNode (rr / rm, typed by OpVT) and through an intrinsic
/// (rr_int / rm_int). $src1 is tied to $dst.
///
/// NOTE(review): in the text as received, rm_int ended with a dangling comma
/// and neither the multiclass nor the enclosing `let` was closed. The
/// `OpSize;` suffix (matching the sibling records) and both closing braces
/// are restored here -- confirm against upstream.
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT,
                                SDNode OpNode, Intrinsic IntId128,
                                bit Commutable = 0> {
    def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst, (OpNode (OpVT VR128:$src1),
                                                   VR128:$src2))]>, OpSize {
      let isCommutable = Commutable;
    }
    def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                      (ins VR128:$src1, VR128:$src2),
                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
                      OpSize {
      let isCommutable = Commutable;
    }
    def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                     (OpVT (OpNode VR128:$src1, (memop addr:$src2))))]>, OpSize;
    def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                       (ins VR128:$src1, i128mem:$src2),
                       !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                       [(set VR128:$dst,
                        (IntId128 VR128:$src1, (memop addr:$src2)))]>,
                       OpSize;
  }
}

/// SS48I_binop_rm - Simple SSE41 binary operator.
///
/// SDNode-based two-address binary operation ($src1 tied to $dst).
/// NOTE(review): the rm pattern hard-codes bc_v4i32 of a v2i64 memop and does
/// not wrap its result in OpVT, unlike rr; the only instantiation visible
/// here uses v4i32.
let Constraints = "$src1 = $dst" in {
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          ValueType OpVT, bit Commutable = 0> {
  def rr : SS48I<opc, MRMSrcReg,
                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst,
                    (OpVT (OpNode VR128:$src1, VR128:$src2)))]>,
           OpSize {
    let isCommutable = Commutable;
  }
  def rm : SS48I<opc, MRMSrcMem,
                 (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                 [(set VR128:$dst,
                    (OpNode VR128:$src1,
                            (bc_v4i32 (memopv2i64 addr:$src2))))]>,
           OpSize;
}
}

defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, 1>;
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
///
/// Two-address ($src1 tied to $dst) intrinsic operation with register (rri)
/// and memory (rmi) forms.
///
/// NOTE(review): in the text as received, the `OpSize {` on rri was never
/// closed and the Commutable parameter was otherwise unused. The
/// `let isCommutable = Commutable;` body is restored here by analogy with the
/// sibling multiclasses in this file -- confirm against upstream.
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
                                 Intrinsic IntId128, bit Commutable = 0> {
    def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
                    !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR128:$dst,
                      (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>,
                    OpSize {
      let isCommutable = Commutable;
    }
    def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3),
                    !strconcat(OpcodeStr,
                     "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                    [(set VR128:$dst,
                      (IntId128 VR128:$src1,
                       (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>,
                    OpSize;
  }
}

defm BLENDPS      : SS41I_binop_rmi_int<0x0C, "blendps",
                                        int_x86_sse41_blendps, 0>;
defm BLENDPD      : SS41I_binop_rmi_int<0x0D, "blendpd",
                                        int_x86_sse41_blendpd, 0>;
defm PBLENDW      : SS41I_binop_rmi_int<0x0E, "pblendw",
                                        int_x86_sse41_pblendw, 0>;
defm DPPS         : SS41I_binop_rmi_int<0x40, "dpps",
                                        int_x86_sse41_dpps, 1>;
defm DPPD         : SS41I_binop_rmi_int<0x41, "dppd",
                                        int_x86_sse41_dppd, 1>;
defm MPSADBW      : SS41I_binop_rmi_int<0x42, "mpsadbw",
                                        int_x86_sse41_mpsadbw, 0>;
/// SS41I_ternary_int - SSE 4.1 ternary operator
///
/// The third intrinsic operand is the fixed register XMM0: it is listed in
/// Uses, appears literally in the assembly string, and is not part of the
/// `ins` list. $src1 is tied to $dst.
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
                    OpSize;

    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, i128mem:$src2),
                    !strconcat(OpcodeStr,
                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
                    [(set VR128:$dst,
                      (IntId VR128:$src1,
                       (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
  }
}

defm BLENDVPD     : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
defm BLENDVPS     : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
defm PBLENDVB     : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;


// SS41I_binop_rm_int8 - Unary intrinsic operation whose memory form takes an
// i64mem operand: the rm pattern matches an i64 load placed into a v2i64 via
// scalar_to_vector and then bitconverted.
multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>,
           OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                    (IntId (bitconvert
                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
           OpSize;
}

defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
defm PMOVSXWD : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
defm PMOVSXDQ : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
defm PMOVZXBW : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
defm PMOVZXWD : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
defm PMOVZXDQ : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;

// Common patterns involving scalar load.
//
// For each intrinsic, both the vzmovl_v2i64 and the vzload_v2i64 operand
// shapes are folded into the corresponding rm instruction.
def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
          (PMOVSXBWrm addr:$src)>,
      Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
          (PMOVSXBWrm addr:$src)>,
      Requires<[HasSSE41]>;

def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
          (PMOVSXWDrm addr:$src)>,
      Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
          (PMOVSXWDrm addr:$src)>,
      Requires<[HasSSE41]>;

def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
          (PMOVSXDQrm addr:$src)>,
      Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
          (PMOVSXDQrm addr:$src)>,
      Requires<[HasSSE41]>;

def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
          (PMOVZXBWrm addr:$src)>,
      Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
          (PMOVZXBWrm addr:$src)>,
      Requires<[HasSSE41]>;

def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
          (PMOVZXWDrm addr:$src)>,
      Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
          (PMOVZXWDrm addr:$src)>,
      Requires<[HasSSE41]>;

def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
          (PMOVZXDQrm addr:$src)>,
      Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
          (PMOVZXDQrm addr:$src)>,
      Requires<[HasSSE41]>;


// SS41I_binop_rm_int4 - Unary intrinsic operation whose memory form takes an
// i32mem operand: the rm pattern matches an i32 load placed into a v4i32 via
// scalar_to_vector and then bitconverted.
multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>,
           OpSize;

  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst,
                    (IntId (bitconvert
                      (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
           OpSize;
}

defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
defm PMOVSXWQ : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
defm PMOVZXBD : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
defm PMOVZXWQ : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;

// Common patterns involving scalar load
// The vzmovl_v4i32 operand shape is folded into the rm form of each
// instruction.
def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
          (PMOVSXBDrm addr:$src)>,
      Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
          (PMOVSXWQrm addr:$src)>,
      Requires<[HasSSE41]>;

def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
          (PMOVZXBDrm addr:$src)>,
      Requires<[HasSSE41]>;
def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
          (PMOVZXWQrm addr:$src)>,
      Requires<[HasSSE41]>;
// SS41I_binop_rm_int2 - Unary intrinsic operation whose memory form takes an
// i16mem operand.
multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;

  // Expecting a i16 load any extended to i32 value.
  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                 [(set VR128:$dst, (IntId (bitconvert
                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
                 OpSize;
}

defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
// Common patterns involving scalar load
// These match an i32 load (scalar_to_vector, X86vzmovl, bitconvert) feeding
// the pmovsxbq / pmovzxbq intrinsics and select the rm instruction forms.
def : Pat<(int_x86_sse41_pmovsxbq
            (bitconvert
              (v4i32 (X86vzmovl
                       (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
          (PMOVSXBQrm addr:$src)>,
      Requires<[HasSSE41]>;

def : Pat<(int_x86_sse41_pmovzxbq
            (bitconvert
              (v4i32 (X86vzmovl
                       (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
          (PMOVZXBQrm addr:$src)>,
      Requires<[HasSSE41]>;
/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
  // Register destination: selected from X86pextrb on a v16i8 source.
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                   (ins VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   [(set GR32:$dst,
                      (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
           OpSize;
  // Memory destination: defined without a selection pattern (see FIXME).
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                   (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
                   !strconcat(OpcodeStr,
                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                   []>,
           OpSize;
// FIXME:
// There's an AssertZext in the way of writing the store pattern
// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
}

defm PEXTRB : SS41I_extract8<0x14, "pextrb">;


/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
///
/// Only the memory-destination form is defined, and it carries no selection
/// pattern (see FIXME).
/// NOTE(review): the closing brace of this multiclass was missing from the
/// text as received and is restored before the defm.
multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 []>, OpSize;
// FIXME:
// There's an AssertZext in the way of writing the store pattern
// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
}

defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;

/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
///
/// rr selects extractelt of a v4i32 into GR32; mr selects a store of the same
/// extractelt.
/// NOTE(review): the closing brace of this multiclass was missing from the
/// text as received and is restored before the defm.
multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
                          addr:$dst)]>, OpSize;
}

defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;

/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
/// destination
///
/// Both forms match extractelt on a v4f32 source viewed through bc_v4i32.
/// NOTE(review): in the text as received, rr ended with a dangling comma, the
/// mr store pattern was cut off before its address operand, and the
/// multiclass was not closed. The `OpSize;` suffixes, the
/// `addr:$dst)]>` tail and the closing brace are restored here by analogy
/// with SS41I_extract32 -- confirm against upstream.
multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
                 (ins VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(set GR32:$dst,
                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
                 OpSize;
  def mr : SS4AIi8<opc, MRMDestMem, (outs),
                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
                 !strconcat(OpcodeStr,
                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
                          addr:$dst)]>, OpSize;
}

defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;

// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
// The extracted i32 element is bitconverted to f32 before the store; the same
// EXTRACTPSmr instruction is selected.
def : Pat<(store (f32 (bitconvert
                        (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
                                    imm:$src2))),
                 addr:$dst),
          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
      Requires<[HasSSE41]>;

// SS41I_insert8 - Two-address ($src1 tied to $dst) X86pinsrb with an 8-bit
// immediate. The inserted value comes from GR32 (rr) or from an extloadi8 of
// an i8mem operand (rm).
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                        (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>,
             OpSize;
    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                        (X86pinsrb VR128:$src1,
                                   (extloadi8 addr:$src2),
                                   imm:$src3))]>,
             OpSize;
  }
}

defm PINSRB : SS41I_insert8<0x20, "pinsrb">;

// SS41I_insert32 - Two-address ($src1 tied to $dst) v4i32 insertelt with an
// 8-bit immediate. The inserted value comes from GR32 (rr) or from a loadi32
// of an i32mem operand (rm).
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                        (v4i32 (insertelt VR128:$src1, GR32:$src2,
                                          imm:$src3)))]>,
             OpSize;
    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
                     !strconcat(OpcodeStr,
                       "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                     [(set VR128:$dst,
                        (v4i32 (insertelt VR128:$src1,
                                          (loadi32 addr:$src2),
                                          imm:$src3)))]>,
             OpSize;
  }
}

defm PINSRD : SS41I_insert32<0x22, "pinsrd">;

// insertps has a few different modes, there's the first two here below which
// are optimized inserts that won't zero arbitrary elements in the destination
// vector. The next one matches the intrinsic and could zero arbitrary elements
// in the target vector.
//
// NOTE(review): in the text as received, the rr record ended with a dangling
// comma; the `OpSize;` suffix is restored here to match rm -- confirm against
// upstream.
let Constraints = "$src1 = $dst" in {
  multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
                   !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set VR128:$dst,
                     (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
                   OpSize;
    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
                   !strconcat(OpcodeStr,
                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
                   [(set VR128:$dst,
                     (X86insrtps VR128:$src1,
                                (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                                 imm:$src3))]>, OpSize;
  }
}

defm INSERTPS    : SS41I_insertf32<0x21, "insertps">;

// The intrinsic form is selected to the register-register instruction.
def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
          (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>;

// ptest instruction we'll lower to this in X86ISelLowering primarily from
// the intel intrinsic that corresponds to this.
//
// Neither form has a register output; the X86ptest result is set into EFLAGS,
// which is listed in Defs.
let Defs = [EFLAGS] in {
def PTESTrr : SS48I<0x17, MRMSrcReg,
                    (outs), (ins VR128:$src1, VR128:$src2),
                    "ptest \t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS,
                       (X86ptest VR128:$src1, VR128:$src2))]>,
              OpSize;
def PTESTrm : SS48I<0x17, MRMSrcMem,
                    (outs), (ins VR128:$src1, i128mem:$src2),
                    "ptest \t{$src2, $src1|$src1, $src2}",
                    [(set EFLAGS,
                       (X86ptest VR128:$src1, (load addr:$src2)))]>,
              OpSize;
}

// movntdqa: memory-source-only record, selected from the
// int_x86_sse41_movntdqa intrinsic applied to the address operand.
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem,
                       (outs VR128:$dst), (ins i128mem:$src),
                       "movntdqa\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                          (int_x86_sse41_movntdqa addr:$src))]>,
                 OpSize;

//===----------------------------------------------------------------------===//
// SSE4.2 Instructions
//===----------------------------------------------------------------------===//

/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
///
/// Two-address ($src1 tied to $dst) intrinsic-based binary operation with a
/// register form (rr) and a memory form (rm). Only rr receives the
/// Commutable flag.
let Constraints = "$src1 = $dst" in {
  multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId128, bit Commutable = 0> {
    def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
                   OpSize {
      let isCommutable = Commutable;
    }
    def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                     (IntId128 VR128:$src1,
                      (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
  }
}

defm PCMPGTQ      : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;

def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
          (PCMPGTQrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
          (PCMPGTQrm VR128:$src1, addr:$src2)>;

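// CRC32 intrinsic instructions.
// Every form takes an accumulator register ($src1, tied to $dst) and a
// register-or-memory source; the forms differ only in the width of the
// accumulator register and of the source operand.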
let Constraints = "$src1 = $dst" in {
  // Naming: CRC<accumulator width>{m,r}<source width>; 'm' takes the source
  // from memory, 'r' from a register. Opcode 0xF0 is used for the 8-bit
  // source forms and 0xF1 for the wider ones. The 16-bit source forms carry
  // OpSize and the 64-bit accumulator forms carry REX_W.

  // 32-bit accumulator, 8-bit source.
  def CRC32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i8mem:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_8 GR32:$src1,
                         (load addr:$src2)))]>;
  def CRC32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR8:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>;

  // 32-bit accumulator, 16-bit source.
  def CRC32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i16mem:$src2),
                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_16 GR32:$src1,
                         (load addr:$src2)))]>,
                         OpSize;
  def CRC32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR16:$src2),
                      "crc32{w} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>,
                         OpSize;

  // 32-bit accumulator, 32-bit source.
  def CRC32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
                      (ins GR32:$src1, i32mem:$src2),
                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32 GR32:$src1,
                         (load addr:$src2)))]>;
  def CRC32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
                      (ins GR32:$src1, GR32:$src2),
                      "crc32{l} \t{$src2, $src1|$src1, $src2}",
                       [(set GR32:$dst,
                         (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>;

  // 64-bit accumulator, 8-bit source.
  def CRC64m8  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
                      (ins GR64:$src1, i8mem:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc64_8 GR64:$src1,
                         (load addr:$src2)))]>,
                         REX_W;
  def CRC64r8  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
                      (ins GR64:$src1, GR8:$src2),
                      "crc32{b} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc64_8 GR64:$src1, GR8:$src2))]>,
                         REX_W;

  // 64-bit accumulator, 64-bit source.
  def CRC64m64  : SS42FI<0xF1, MRMSrcMem, (outs GR64:$dst),
                      (ins GR64:$src1, i64mem:$src2),
                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc64_64 GR64:$src1,
                         (load addr:$src2)))]>,
                         REX_W;
  def CRC64r64  : SS42FI<0xF1, MRMSrcReg, (outs GR64:$dst),
                      (ins GR64:$src1, GR64:$src2),
                      "crc32{q} \t{$src2, $src1|$src1, $src2}",
                       [(set GR64:$dst,
                         (int_x86_sse42_crc64_64 GR64:$src1, GR64:$src2))]>,
                         REX_W;
}

// String/text processing instructions.

// PCMPISTRM pseudo-instructions. They carry the selection pattern for
// int_x86_sse42_pcmpistrm128, expose the result as an explicit VR128 output,
// and are flagged usesCustomInserter; their format is Pseudo with opcode 0.
let Defs = [EFLAGS], usesCustomInserter = 1 in {
// Register second operand.
def PCMPISTRM128REG : SS42AI<0, Pseudo,
  (outs VR128:$dst),
  (ins VR128:$src1, VR128:$src2, i8imm:$src3),
  "#PCMPISTRM128rr PSEUDO!",
  [(set VR128:$dst,
        (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2, imm:$src3))]>,
  OpSize;

// Memory second operand (loaded from a 128-bit location).
def PCMPISTRM128MEM : SS42AI<0, Pseudo,
  (outs VR128:$dst),
  (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
  "#PCMPISTRM128rm PSEUDO!",
  [(set VR128:$dst,
        (int_x86_sse42_pcmpistrm128 VR128:$src1, (load addr:$src2),
                                    imm:$src3))]>,
  OpSize;
}

// Real PCMPISTRM encodings (opcode 0x62). They have no explicit outputs and
// no selection patterns; XMM0 and EFLAGS are modeled as implicit defs.
let Defs = [XMM0, EFLAGS] in {
def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
  (ins VR128:$src1, VR128:$src2, i8imm:$src3),
   "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
  (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
  "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
}

// PCMPESTRM pseudo-instructions. They carry the selection pattern for
// int_x86_sse42_pcmpestrm128, whose second and fourth operands are the fixed
// registers EAX and EDX (listed as implicit uses). Both are flagged
// usesCustomInserter; their format is Pseudo with opcode 0.
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
// Register second vector operand.
def PCMPESTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
  (ins VR128:$src1, VR128:$src3, i8imm:$src5),
  "#PCMPESTRM128rr PSEUDO!",
  [(set VR128:$dst,
        (int_x86_sse42_pcmpestrm128
         VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>, OpSize;

// Memory second vector operand (loaded from a 128-bit location).
def PCMPESTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
  (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
  "#PCMPESTRM128rm PSEUDO!",
  [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
                     VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>,
  OpSize;
}

// Real PCMPESTRM encodings (opcode 0x60). No explicit outputs and no
// selection patterns: XMM0 and EFLAGS are implicit defs, EAX and EDX are
// implicit uses.
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
// Register second vector operand.
def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg,
  (outs),
  (ins VR128:$src1, VR128:$src3, i8imm:$src5),
  "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}",
  []>, OpSize;

// Memory second vector operand.
def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem,
  (outs),
  (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
  "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}",
  []>, OpSize;
}

// SS42AI_pcmpistri - rr/rm forms of PCMPISTRI (opcode 0x63) selected from the
// intrinsic IntId128. The pattern sets ECX and lists EFLAGS as an implicit
// result; both registers are also declared as implicit defs.
let Defs = [ECX, EFLAGS] in {
  multiclass SS42AI_pcmpistri<Intrinsic IntId128> {
    // Register second operand.
    def rr : SS42AI<0x63, MRMSrcReg,
      (outs),
      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
      "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
      [(set ECX,
            (IntId128 VR128:$src1, VR128:$src2, imm:$src3)),
       (implicit EFLAGS)]>,
      OpSize;

    // Memory second operand (loaded from a 128-bit location).
    def rm : SS42AI<0x63, MRMSrcMem,
      (outs),
      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
      "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
      [(set ECX,
            (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)),
       (implicit EFLAGS)]>,
      OpSize;
  }
}

// Instantiate SS42AI_pcmpistri once per pcmpistri intrinsic variant. Every
// instantiation yields an rr and an rm record with the same opcode and
// assembly string; only the matched intrinsic differs.
defm PCMPISTRI  : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>;
defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>;
defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>;
defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>;
defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>;
defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>;

// SS42AI_pcmpestri - rr/rm forms of PCMPESTRI (opcode 0x61) selected from the
// intrinsic IntId128. The intrinsic's second and fourth operands are the
// fixed registers EAX and EDX (declared as implicit uses). The pattern sets
// ECX and lists EFLAGS as an implicit result; both are implicit defs.
let Defs = [ECX, EFLAGS] in {
let Uses = [EAX, EDX] in {
  multiclass SS42AI_pcmpestri<Intrinsic IntId128> {
    // Register second vector operand.
    def rr : SS42AI<0x61, MRMSrcReg, (outs),
      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
      "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
      [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)),
       (implicit EFLAGS)]>, OpSize;
    // Memory second vector operand (loaded from a 128-bit location).
    def rm : SS42AI<0x61, MRMSrcMem, (outs),
      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
      "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
      [(set ECX,
            (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)),
       (implicit EFLAGS)]>, OpSize;
  }
}
}

// Instantiate SS42AI_pcmpestri once per pcmpestri intrinsic variant. Every
// instantiation yields an rr and an rm record with the same opcode and
// assembly string; only the matched intrinsic differs.
defm PCMPESTRI  : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>;
defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>;
defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>;
defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>;
defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>;
defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;

//===----------------------------------------------------------------------===//
// AES-NI Instructions
//===----------------------------------------------------------------------===//

/// AESI_binop_rm_int - Two-operand AES-NI instruction selected from a 128-bit
/// intrinsic.
///
///   opc         - opcode byte passed to AES8I.
///   OpcodeStr   - mnemonic; the operand string is appended to it.
///   IntId128    - intrinsic matched by both forms.
///   Commutable  - forwarded to isCommutable on the rr form only.
///
/// $src1 is tied to $dst, so the assembly string prints only $dst and $src2.
let Constraints = "$src1 = $dst" in {
  multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
                                Intrinsic IntId128, bit Commutable = 0> {
    // Register-register form.
    def rr : AES8I<opc, MRMSrcReg,
                   (outs VR128:$dst),
                   (ins VR128:$src1, VR128:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                         (IntId128 VR128:$src1, VR128:$src2))]>,
             OpSize {
      let isCommutable = Commutable;
    }

    // Register-memory form: the second operand is a memopv16i8 load that is
    // bitconverted to the type the intrinsic expects.
    def rm : AES8I<opc, MRMSrcMem,
                   (outs VR128:$dst),
                   (ins VR128:$src1, i128mem:$src2),
                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
                   [(set VR128:$dst,
                         (IntId128 VR128:$src1,
                                   (bitconvert (memopv16i8 addr:$src2))))]>,
             OpSize;
  }
}

// AES round instructions: one rr/rm pair per intrinsic, using opcodes
// 0xDC..0xDF. Commutable is left at its default of 0 for all four.
defm AESENC          : AESI_binop_rm_int<0xDC, "aesenc",
                       int_x86_aesni_aesenc>;
defm AESENCLAST      : AESI_binop_rm_int<0xDD, "aesenclast",
                       int_x86_aesni_aesenclast>;
defm AESDEC          : AESI_binop_rm_int<0xDE, "aesdec",
                       int_x86_aesni_aesdec>;
defm AESDECLAST      : AESI_binop_rm_int<0xDF, "aesdeclast",
                       int_x86_aesni_aesdeclast>;

// Explicit v2i64-typed selection patterns for the four AES round intrinsics.
// Each intrinsic gets two patterns: one with a register second operand
// (selecting the rr record) and one with a memop second operand (selecting
// the rm record).

// aesenc
def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, VR128:$src2)),
          (AESENCrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (int_x86_aesni_aesenc VR128:$src1, (memop addr:$src2))),
          (AESENCrm VR128:$src1, addr:$src2)>;
// aesenclast
def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, VR128:$src2)),
          (AESENCLASTrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (int_x86_aesni_aesenclast VR128:$src1, (memop addr:$src2))),
          (AESENCLASTrm VR128:$src1, addr:$src2)>;
// aesdec
def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, VR128:$src2)),
          (AESDECrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (int_x86_aesni_aesdec VR128:$src1, (memop addr:$src2))),
          (AESDECrm VR128:$src1, addr:$src2)>;
// aesdeclast
def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, VR128:$src2)),
          (AESDECLASTrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (int_x86_aesni_aesdeclast VR128:$src1, (memop addr:$src2))),
          (AESDECLASTrm VR128:$src1, addr:$src2)>;

// AESIMC (opcode 0xDB): single-source instruction selected from
// int_x86_aesni_aesimc. Unlike the round instructions, the source is not
// tied to the destination.

// Register source.
def AESIMCrr : AES8I<0xDB, MRMSrcReg,
                     (outs VR128:$dst),
                     (ins VR128:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst,
                           (int_x86_aesni_aesimc VR128:$src1))]>,
               OpSize;

// Memory source: a memopv2i64 load, bitconverted for the intrinsic.
def AESIMCrm : AES8I<0xDB, MRMSrcMem,
                     (outs VR128:$dst),
                     (ins i128mem:$src1),
                     "aesimc\t{$src1, $dst|$dst, $src1}",
                     [(set VR128:$dst,
                           (int_x86_aesni_aesimc
                             (bitconvert (memopv2i64 addr:$src1))))]>,
               OpSize;

// AESKEYGENASSIST (opcode 0xDF, AESAI class): one vector source plus an
// 8-bit immediate, selected from int_x86_aesni_aeskeygenassist.

// Register source.
def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg,
    (outs VR128:$dst),
    (ins VR128:$src1, i8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
          (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
    OpSize;

// Memory source: a memopv2i64 load, bitconverted for the intrinsic.
def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem,
    (outs VR128:$dst),
    (ins i128mem:$src1, i8imm:$src2),
    "aeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
    [(set VR128:$dst,
          (int_x86_aesni_aeskeygenassist
            (bitconvert (memopv2i64 addr:$src1)), imm:$src2))]>,
    OpSize;