diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ea6a28d7d75d9ced2481970c88bbd8e7d31a4e57..1e7809c302ed003db0677d39f0a270c8a14f7b40 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -7219,7 +7219,12 @@ static const uint16_t ReplaceableInstrs[][3] = {
   { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
   { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
   { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
-  { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }
+  { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
+  // AVX512 support
+  { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
+  { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+  { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
+  { X86::VMOVNTPSZmr, X86::VMOVNTPDZmr, X86::VMOVNTDQZmr },
 };
 
 static const uint16_t ReplaceableInstrsAVX2[][3] = {
@@ -7246,6 +7251,40 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
   { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}
 };
 
+static const uint16_t ReplaceableInstrsAVX512[][4] = {
+  // Two integer columns for 64-bit and 32-bit elements.
+  //PackedSingle PackedDouble PackedInt PackedInt
+  { X86::VANDNPSZ128rm, X86::VANDNPDZ128rm, X86::VPANDNQZ128rm, X86::VPANDNDZ128rm },
+  { X86::VANDNPSZ128rr, X86::VANDNPDZ128rr, X86::VPANDNQZ128rr, X86::VPANDNDZ128rr },
+  { X86::VANDPSZ128rm, X86::VANDPDZ128rm, X86::VPANDQZ128rm, X86::VPANDDZ128rm },
+  { X86::VANDPSZ128rr, X86::VANDPDZ128rr, X86::VPANDQZ128rr, X86::VPANDDZ128rr },
+  { X86::VORPSZ128rm, X86::VORPDZ128rm, X86::VPORQZ128rm, X86::VPORDZ128rm },
+  { X86::VORPSZ128rr, X86::VORPDZ128rr, X86::VPORQZ128rr, X86::VPORDZ128rr },
+  { X86::VXORPSZ128rm, X86::VXORPDZ128rm, X86::VPXORQZ128rm, X86::VPXORDZ128rm },
+  { X86::VXORPSZ128rr, X86::VXORPDZ128rr, X86::VPXORQZ128rr, X86::VPXORDZ128rr },
+  { X86::VANDNPSZ256rm, X86::VANDNPDZ256rm, X86::VPANDNQZ256rm, X86::VPANDNDZ256rm },
+  { X86::VANDNPSZ256rr, X86::VANDNPDZ256rr, X86::VPANDNQZ256rr, X86::VPANDNDZ256rr },
+  { X86::VANDPSZ256rm, X86::VANDPDZ256rm, X86::VPANDQZ256rm, X86::VPANDDZ256rm },
+  { X86::VANDPSZ256rr, X86::VANDPDZ256rr, X86::VPANDQZ256rr, X86::VPANDDZ256rr },
+  { X86::VORPSZ256rm, X86::VORPDZ256rm, X86::VPORQZ256rm, X86::VPORDZ256rm },
+  { X86::VORPSZ256rr, X86::VORPDZ256rr, X86::VPORQZ256rr, X86::VPORDZ256rr },
+  { X86::VXORPSZ256rm, X86::VXORPDZ256rm, X86::VPXORQZ256rm, X86::VPXORDZ256rm },
+  { X86::VXORPSZ256rr, X86::VXORPDZ256rr, X86::VPXORQZ256rr, X86::VPXORDZ256rr },
+  { X86::VANDNPSZrm, X86::VANDNPDZrm, X86::VPANDNQZrm, X86::VPANDNDZrm },
+  { X86::VANDNPSZrr, X86::VANDNPDZrr, X86::VPANDNQZrr, X86::VPANDNDZrr },
+  { X86::VANDPSZrm, X86::VANDPDZrm, X86::VPANDQZrm, X86::VPANDDZrm },
+  { X86::VANDPSZrr, X86::VANDPDZrr, X86::VPANDQZrr, X86::VPANDDZrr },
+  { X86::VORPSZrm, X86::VORPDZrm, X86::VPORQZrm, X86::VPORDZrm },
+  { X86::VORPSZrr, X86::VORPDZrr, X86::VPORQZrr, X86::VPORDZrr },
+  { X86::VXORPSZrm, X86::VXORPDZrm, X86::VPXORQZrm, X86::VPXORDZrm },
+  { X86::VXORPSZrr, X86::VXORPDZrr, X86::VPXORQZrr, X86::VPXORDZrr },
+  { X86::VMOVAPSZmr, X86::VMOVAPDZmr, X86::VMOVDQA64Zmr, X86::VMOVDQA64Zmr },
+  { X86::VMOVAPSZrm, X86::VMOVAPDZrm, X86::VMOVDQA64Zrm, X86::VMOVDQA64Zrm },
+  { X86::VMOVAPSZrr, X86::VMOVAPDZrr, X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrr },
+  { X86::VMOVUPSZmr, X86::VMOVUPDZmr, X86::VMOVDQU64Zmr, X86::VMOVDQU64Zmr },
+  { X86::VMOVUPSZrm, X86::VMOVUPDZrm, X86::VMOVDQU64Zrm, X86::VMOVDQU64Zrm },
+};
+
 // FIXME: Some shuffle and unpack instructions have equivalents in different
 // domains, but they require a bit more work than just switching opcodes.
 
@@ -7263,6 +7302,14 @@ static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
   return nullptr;
 }
 
+static const uint16_t *lookupAVX512(unsigned opcode, unsigned domain) {
+  // If this is the integer domain make sure to check both integer columns.
+  for (const uint16_t (&Row)[4] : ReplaceableInstrsAVX512)
+    if (Row[domain-1] == opcode || (domain == 3 && Row[3] == opcode))
+      return Row;
+  return nullptr;
+}
+
 std::pair<uint16_t, uint16_t>
 X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
   uint16_t domain = (MI.getDesc().TSFlags >> X86II::SSEDomainShift) & 3;
@@ -7272,6 +7319,8 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
     validDomains = 0xe;
   else if (domain && lookupAVX2(MI.getOpcode(), domain))
     validDomains = hasAVX2 ? 0xe : 0x6;
+  else if (domain && lookupAVX512(MI.getOpcode(), domain))
+    validDomains = 0xe;
   return std::make_pair(domain, validDomains);
 }
 
@@ -7285,6 +7334,12 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
            "256-bit vector operations only available in AVX2");
     table = lookupAVX2(MI.getOpcode(), dom);
   }
+  if (!table) { // try the AVX512 table
+    table = lookupAVX512(MI.getOpcode(), dom);
+    // Don't change integer Q instructions to D instructions.
+    if (table && Domain == 3 && table[3] == MI.getOpcode())
+      Domain = 4;
+  }
   assert(table && "Cannot change domain");
   MI.setDesc(get(table[Domain - 1]));
 }
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 50c9c25a27c086d081ef772bf663d0dd8ff00ce6..d58882d74696135d8514c36fceb88b1ae6f3e411 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -321,7 +321,7 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
 
 void X86PassConfig::addPreEmitPass() {
   if (getOptLevel() != CodeGenOpt::None)
-    addPass(createExecutionDependencyFixPass(&X86::VR128RegClass));
+    addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass));
 
   if (UseVZeroUpper)
     addPass(createX86IssueVZeroUpperPass());
diff --git a/llvm/test/CodeGen/X86/avx512-arith.ll b/llvm/test/CodeGen/X86/avx512-arith.ll
index 62dece137cc0a6d51f4ce9005c4d924084e4b468..5e71fffff022818beac6f045b5819d26ff899fe6 100644
--- a/llvm/test/CodeGen/X86/avx512-arith.ll
+++ b/llvm/test/CodeGen/X86/avx512-arith.ll
@@ -891,7 +891,7 @@ define <8 x double> @test_mask_broadcast_vaddpd(<8 x double> %dst, <8 x double>
 ; CHECK-NEXT: vpxord %zmm0, %zmm0, %zmm0
 ; CHECK-NEXT: vpcmpneqq %zmm0, %zmm2, %k1
 ; CHECK-NEXT: vaddpd (%rdi){1to8}, %zmm1, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovapd %zmm1, %zmm0
 ; CHECK-NEXT: retq
                                          double* %j, <8 x i64> %mask1) nounwind {
   %mask = icmp ne <8 x i64> %mask1, zeroinitializer
@@ -962,30 +962,10 @@ define <8 x float> @test_fxor_8f32(<8 x float> %a) {
 }
 
 define <8 x double> @fabs_v8f64(<8 x double> %p)
-; AVX512F-LABEL: fabs_v8f64:
-; AVX512F: ## BB#0:
-; AVX512F-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: fabs_v8f64:
-; AVX512VL: ## BB#0:
-; AVX512VL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VL-NEXT: retq
-;
-; AVX512BW-LABEL: fabs_v8f64:
-; AVX512BW: ## BB#0:
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: retq
-;
-; AVX512DQ-LABEL: fabs_v8f64:
-; AVX512DQ: ## BB#0:
-; AVX512DQ-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0
-; AVX512DQ-NEXT: retq
-;
-; SKX-LABEL: fabs_v8f64:
-; SKX:
## BB#0: -; SKX-NEXT: vandpd {{.*}}(%rip), %zmm0, %zmm0 -; SKX-NEXT: retq +; CHECK-LABEL: fabs_v8f64: +; CHECK: ## BB#0: +; CHECK-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; CHECK-NEXT: retq { %t = call <8 x double> @llvm.fabs.v8f64(<8 x double> %p) ret <8 x double> %t diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll index d024475274b48159becd45f525609e196310e32b..1cf1c076796fc467de6b52cd62c3fbcdf68e2f5e 100644 --- a/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-25270.ll @@ -10,7 +10,7 @@ define void @bar__512(<16 x i32>* %var) #0 { ; CHECK-NEXT: subq $112, %rsp ; CHECK-NEXT: movq %rdi, %rbx ; CHECK-NEXT: vmovdqu32 (%rbx), %zmm0 -; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill +; CHECK-NEXT: vmovdqu64 %zmm0, (%rsp) ## 64-byte Spill ; CHECK-NEXT: vpbroadcastd {{.*}}(%rip), %zmm1 ; CHECK-NEXT: vmovdqa32 %zmm1, (%rbx) ; CHECK-NEXT: callq _Print__512 diff --git a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll index b3e1b17076bb33552339d8b75b0a05855fd362e7..b15d28a649b39a85f8b26b6b0f62331992d185eb 100644 --- a/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll +++ b/llvm/test/CodeGen/X86/avx512-bugfix-26264.ll @@ -13,10 +13,10 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 ; AVX512BW-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} ; AVX512BW-NEXT: kshiftrw $8, %k2, %k1 ; AVX512BW-NEXT: vmovupd 192(%rdi), %zmm4 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 -; AVX512BW-NEXT: vmovaps %zmm2, %zmm1 -; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 -; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 +; AVX512BW-NEXT: vmovapd %zmm1, %zmm0 +; AVX512BW-NEXT: vmovapd %zmm2, %zmm1 +; AVX512BW-NEXT: vmovapd %zmm3, %zmm2 +; AVX512BW-NEXT: vmovapd %zmm4, %zmm3 ; AVX512BW-NEXT: retq %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0) ret <32 x double> %res @@ -34,10 +34,10 @@ define <32 x i64> @test_load_32i64(<32 x i64>* %ptrs, <32 x i1> %mask, <32 x i64 ; AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} ; AVX512BW-NEXT: kshiftrw $8, %k2, %k1 ; AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm4 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 -; AVX512BW-NEXT: vmovaps %zmm2, %zmm1 -; AVX512BW-NEXT: vmovaps %zmm3, %zmm2 -; AVX512BW-NEXT: vmovaps %zmm4, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm2 +; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 ; AVX512BW-NEXT: retq %res = call <32 x i64> @llvm.masked.load.v32i64.p0v32i64(<32 x i64>* %ptrs, i32 4, <32 x i1> %mask, <32 x i64> %src0) ret <32 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512-ext.ll b/llvm/test/CodeGen/X86/avx512-ext.ll index faac7b20fd61814adaa336a6ad8c7608f0726a92..e39f303fef9c010615f9ff8615a1c59c9a8f42fe 100644 --- a/llvm/test/CodeGen/X86/avx512-ext.ll +++ b/llvm/test/CodeGen/X86/avx512-ext.ll @@ -163,7 +163,7 @@ define <32 x i16> @zext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1 -; KNL-NEXT: vmovaps %zmm2, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32x8mem_to_32x16: @@ -192,7 +192,7 @@ define <32 x i16> @sext_32x8mem_to_32x16(<32 x i8> *%i , <32 x i1> %mask) nounwi ; KNL-NEXT: vpsllw $15, %ymm0, %ymm0 ; KNL-NEXT: vpsraw $15, %ymm0, %ymm0 ; KNL-NEXT: vpand %ymm1, %ymm0, %ymm1 -; KNL-NEXT: vmovaps %zmm2, %zmm0 
+; KNL-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_32x8mem_to_32x16: @@ -213,7 +213,7 @@ define <32 x i16> @zext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; KNL-NEXT: vmovaps %zmm2, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: zext_32x8_to_32x16: @@ -258,7 +258,7 @@ define <32 x i16> @sext_32x8_to_32x16(<32 x i8> %a ) nounwind readnone { ; KNL-NEXT: vpmovsxbw %xmm0, %ymm2 ; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; KNL-NEXT: vpmovsxbw %xmm0, %ymm1 -; KNL-NEXT: vmovaps %zmm2, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: sext_32x8_to_32x16: diff --git a/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll index d8026cd987c2e4a67469253208fc6ed98e54c2d0..f52430d2f22f742fe80d7bd20dc45c14452c33f5 100644 --- a/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -129,7 +129,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_512(<8 x double> %x0, ; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovapd %zmm0, %zmm3 ; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -146,7 +146,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_512(<8 x double> %x0, ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vmovapd %zmm2, %zmm3 ; CHECK-NEXT: vfmaddsub231pd %zmm1, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -163,7 +163,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_512(<8 x double> %x0, ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovapd %zmm0, %zmm3 ; CHECK-NEXT: vfmaddsub213pd %zmm2, %zmm1, %zmm3 {%k1} {z} ; CHECK-NEXT: vfmaddsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -229,7 +229,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_512(<8 x double> %x0, ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vmovapd %zmm2, %zmm3 ; CHECK-NEXT: vfmsubadd231pd %zmm1, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vfmsubadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -358,7 +358,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmsub_pd_512(<8 x double> %x0, <8 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vmovapd %zmm2, %zmm3 ; CHECK-NEXT: vfmsub231pd %zmm1, %zmm0, %zmm3 
{%k1} ; CHECK-NEXT: vfmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -485,7 +485,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfmadd_pd_512(<8 x double> %x0, <8 ; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovapd %zmm0, %zmm3 ; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -502,7 +502,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfmadd_pd_512(<8 x double> %x0, <8 ; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vmovapd %zmm2, %zmm3 ; CHECK-NEXT: vfmadd231pd %zmm1, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -519,7 +519,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vfmadd_pd_512(<8 x double> %x0, <8 ; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovapd %zmm0, %zmm3 ; CHECK-NEXT: vfmadd213pd %zmm2, %zmm1, %zmm3 {%k1} {z} ; CHECK-NEXT: vfmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -679,7 +679,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfnmsub_pd_512(<8 x double> %x0, <8 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovapd %zmm0, %zmm3 ; CHECK-NEXT: vfnmsub213pd %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -696,7 +696,7 @@ define <8 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_512(<8 x double> %x0, < ; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vmovapd %zmm2, %zmm3 ; CHECK-NEXT: vfnmsub231pd %zmm1, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vfnmsub213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 @@ -743,7 +743,7 @@ define <8 x double>@test_int_x86_avx512_mask_vfnmadd_pd_512(<8 x double> %x0, <8 ; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovapd %zmm0, %zmm3 ; CHECK-NEXT: vfnmadd213pd %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vfnmadd213pd {rn-sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 2c42aca33e45a1208596325865678bd42b52cae5..21de1aca43e719bf66302a2694393e42f51fd4fc 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -204,7 +204,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { ; KNL-NEXT: testb %al, %al ; KNL-NEXT: je LBB10_2 ; KNL-NEXT: ## BB#1: ## %A -; KNL-NEXT: vmovaps %zmm1, %zmm0 +; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL-NEXT: retq ; KNL-NEXT: LBB10_2: ## %B ; KNL-NEXT: vpaddd %zmm0, %zmm1, %zmm0 @@ -219,7 +219,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) { ; SKX-NEXT: testb %al, %al ; SKX-NEXT: je LBB10_2 ; SKX-NEXT: ## BB#1: ## %A -; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 ; SKX-NEXT: retq ; SKX-NEXT: LBB10_2: ## %B ; 
SKX-NEXT: vpaddd %zmm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 7d0535546dfae9f7b345355d91ad8c6cc86b3b97..22a07c0734188ab7e1ffe82dd187e548d5a50a32 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -745,7 +745,7 @@ define <16 x i32> @test_x86_avx512_mask_pslli_d(<16 x i32> %a0, <16 x i32> %a1, ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpslld $7, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pslli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) ret <16 x i32> %res @@ -777,7 +777,7 @@ define <8 x i64> @test_x86_avx512_mask_pslli_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllq $7, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pslli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) ret <8 x i64> %res @@ -809,7 +809,7 @@ define <16 x i32> @test_x86_avx512_mask_psrli_d(<16 x i32> %a0, <16 x i32> %a1, ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrld $7, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrli.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) ret <16 x i32> %res @@ -841,7 +841,7 @@ define <8 x i64> @test_x86_avx512_mask_psrli_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlq $7, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrli.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) ret <8 x i64> %res @@ -873,7 +873,7 @@ define <16 x i32> @test_x86_avx512_mask_psrai_d(<16 x i32> %a0, <16 x i32> %a1, ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrad $7, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrai.d(<16 x i32> %a0, i32 7, <16 x i32> %a1, i16 %mask) ret <16 x i32> %res @@ -905,7 +905,7 @@ define <8 x i64> @test_x86_avx512_mask_psrai_q(<8 x i64> %a0, <8 x i64> %a1, i8 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsraq $7, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrai.q(<8 x i64> %a0, i32 7, <8 x i64> %a1, i8 %mask) ret <8 x i64> %res @@ -928,7 +928,7 @@ declare void @llvm.x86.avx512.storent.q.512(i8*, <8 x i64>) define void@test_storent_q_512(<8 x i64> %data, i8* %ptr) { ; CHECK-LABEL: test_storent_q_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovntdq %zmm0, (%rdi) +; CHECK-NEXT: vmovntps %zmm0, (%rdi) ; CHECK-NEXT: retq call void @llvm.x86.avx512.storent.q.512(i8* %ptr, <8 x i64> %data) ret void @@ -939,7 +939,7 @@ declare void @llvm.x86.avx512.storent.pd.512(i8*, <8 x double>) define void @test_storent_pd_512(<8 x double> %data, i8* %ptr) { ; CHECK-LABEL: test_storent_pd_512: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovntpd %zmm0, (%rdi) +; CHECK-NEXT: vmovntps %zmm0, (%rdi) ; CHECK-NEXT: retq call void @llvm.x86.avx512.storent.pd.512(i8* %ptr, <8 x double> %data) ret void @@ -970,7 
+970,7 @@ define <16 x i32> @test_mask_xor_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> % ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpxord %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pxor.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ret < 16 x i32> %res @@ -992,7 +992,7 @@ define <16 x i32> @test_mask_or_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> %p ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.por.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ret < 16 x i32> %res @@ -1014,7 +1014,7 @@ define <16 x i32> @test_mask_and_epi32(<16 x i32> %a,<16 x i32> %b, <16 x i32> % ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpandd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ret < 16 x i32> %res @@ -1025,7 +1025,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pand.d.512(<16 x i32>, <16 x i32>, <16 define <8 x i64> @test_xor_epi64(<8 x i64> %a, <8 x i64> %b) { ; CHECK-LABEL: test_xor_epi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1) ret < 8 x i64> %res @@ -1036,7 +1036,7 @@ define <8 x i64> @test_mask_xor_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %pass ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ret < 8 x i64> %res @@ -1047,7 +1047,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.pxor.q.512(<8 x i64>, <8 x i64>, <8 x i6 define <8 x i64> @test_or_epi64(<8 x i64> %a, <8 x i64> %b) { ; CHECK-LABEL: test_or_epi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vorps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1) ret < 8 x i64> %res @@ -1058,7 +1058,7 @@ define <8 x i64> @test_mask_or_epi64(<8 x i64> %a,<8 x i64> %b, <8 x i64> %passT ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vporq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ret < 8 x i64> %res @@ -1069,7 +1069,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.por.q.512(<8 x i64>, <8 x i64>, <8 x i64 define <8 x i64> @test_and_epi64(<8 x i64> %a, <8 x i64> %b) { ; CHECK-LABEL: test_and_epi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vandps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a,<8 x i64> %b, <8 x i64>zeroinitializer, i8 -1) ret < 8 x i64> %res @@ -1080,7 +1080,7 @@ define <8 x i64> @test_mask_and_epi64(<8 x i64> %a,<8 
x i64> %b, <8 x i64> %pass ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpandq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pand.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ret < 8 x i64> %res diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 65ed77374388bfa1de8982594d1ce042fa0c84fd..cab1aae11424e79b82a0e28f3ac1f78a965ac3e7 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -679,7 +679,7 @@ define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpconflictq %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) ret <8 x i64> %res @@ -713,7 +713,7 @@ define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntd %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask) ret <16 x i32> %res @@ -724,7 +724,7 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vplzcntq %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask) ret <8 x i64> %res @@ -861,7 +861,7 @@ define <8 x i64> @test_mask_valign_q(<8 x i64> %a, <8 x i64> %b, <8 x i64> %src, ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: valignq $2, %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.valign.q.512(<8 x i64> %a, <8 x i64> %b, i32 2, <8 x i64> %src, i8 %mask) ret <8 x i64> %res @@ -1342,7 +1342,7 @@ define <16 x i32> @test_x86_avx512_mask_psll_d(<16 x i32> %a0, <4 x i32> %a1, <1 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpslld %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psll.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) ret <16 x i32> %res @@ -1374,7 +1374,7 @@ define <8 x i64> @test_x86_avx512_mask_psll_q(<8 x i64> %a0, <2 x i64> %a1, <8 x ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllq %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psll.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) ret <8 x i64> %res @@ -1406,7 +1406,7 @@ define <16 x i32> @test_x86_avx512_mask_psrl_d(<16 x i32> %a0, <4 x i32> %a1, <1 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrld %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) ret <16 x i32> %res @@ -1438,7 +1438,7 @@ 
define <8 x i64> @test_x86_avx512_mask_psrl_q(<8 x i64> %a0, <2 x i64> %a1, <8 x ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlq %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) ret <8 x i64> %res @@ -1470,7 +1470,7 @@ define <16 x i32> @test_x86_avx512_mask_psra_d(<16 x i32> %a0, <4 x i32> %a1, <1 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrad %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psra.d(<16 x i32> %a0, <4 x i32> %a1, <16 x i32> %a2, i16 %mask) ret <16 x i32> %res @@ -1502,7 +1502,7 @@ define <8 x i64> @test_x86_avx512_mask_psra_q(<8 x i64> %a0, <2 x i64> %a1, <8 x ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsraq %xmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psra.q(<8 x i64> %a0, <2 x i64> %a1, <8 x i64> %a2, i8 %mask) ret <8 x i64> %res @@ -1534,7 +1534,7 @@ define <16 x i32> @test_x86_avx512_mask_psllv_d(<16 x i32> %a0, <16 x i32> %a1, ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllvd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psllv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) ret <16 x i32> %res @@ -1566,7 +1566,7 @@ define <8 x i64> @test_x86_avx512_mask_psllv_q(<8 x i64> %a0, <8 x i64> %a1, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsllvq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psllv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) ret <8 x i64> %res @@ -1599,7 +1599,7 @@ define <16 x i32> @test_x86_avx512_mask_psrav_d(<16 x i32> %a0, <16 x i32> %a1, ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsravd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrav.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) ret <16 x i32> %res @@ -1631,7 +1631,7 @@ define <8 x i64> @test_x86_avx512_mask_psrav_q(<8 x i64> %a0, <8 x i64> %a1, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsravq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrav.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) ret <8 x i64> %res @@ -1663,7 +1663,7 @@ define <16 x i32> @test_x86_avx512_mask_psrlv_d(<16 x i32> %a0, <16 x i32> %a1, ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlvd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrlv.d(<16 x i32> %a0, <16 x i32> %a1, <16 x i32> %a2, i16 %mask) ret <16 x i32> %res @@ -1695,7 +1695,7 @@ define <8 x i64> @test_x86_avx512_mask_psrlv_q(<8 x i64> %a0, <8 x i64> %a1, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsrlvq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps 
%zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrlv.q(<8 x i64> %a0, <8 x i64> %a1, <8 x i64> %a2, i8 %mask) ret <8 x i64> %res @@ -1960,7 +1960,7 @@ define <16 x i32> @test_mask_add_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ret < 16 x i32> %res @@ -1991,7 +1991,7 @@ define <16 x i32> @test_mask_add_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <1 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpaddd (%rdi), %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <16 x i32> @llvm.x86.avx512.mask.padd.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) @@ -2026,7 +2026,7 @@ define <16 x i32> @test_mask_add_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i3 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpaddd (%rdi){1to16}, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 @@ -2064,7 +2064,7 @@ define <16 x i32> @test_mask_sub_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <16 x i ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ret < 16 x i32> %res @@ -2095,7 +2095,7 @@ define <16 x i32> @test_mask_sub_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <1 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsubd (%rdi), %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <16 x i32> @llvm.x86.avx512.mask.psub.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) @@ -2130,7 +2130,7 @@ define <16 x i32> @test_mask_sub_epi32_rmbk(<16 x i32> %a, i32* %ptr_b, <16 x i3 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsubd (%rdi){1to16}, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 @@ -2168,7 +2168,7 @@ define <8 x i64> @test_mask_add_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ret < 8 x i64> %res @@ -2199,7 +2199,7 @@ define <8 x i64> @test_mask_add_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpaddq (%rdi), %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %b = load <8 x i64>, <8 x i64>* %ptr_b %res = call <8 x i64> @llvm.x86.avx512.mask.padd.q.512(<8 x i64> %a, 
<8 x i64> %b, <8 x i64> %passThru, i8 %mask) @@ -2234,7 +2234,7 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpaddq (%rdi){1to8}, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %q = load i64, i64* %ptr_b %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 @@ -2272,7 +2272,7 @@ define <8 x i64> @test_mask_sub_epi64_rrk(<8 x i64> %a, <8 x i64> %b, <8 x i64> ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpsubq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ret < 8 x i64> %res @@ -2303,7 +2303,7 @@ define <8 x i64> @test_mask_sub_epi64_rmk(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsubq (%rdi), %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %b = load <8 x i64>, <8 x i64>* %ptr_b %res = call <8 x i64> @llvm.x86.avx512.mask.psub.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) @@ -2338,7 +2338,7 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsubq (%rdi){1to8}, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %q = load i64, i64* %ptr_b %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 @@ -2376,7 +2376,7 @@ define <8 x i64> @test_mask_mul_epi32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmuldq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) ret < 8 x i64> %res @@ -2407,7 +2407,7 @@ define <8 x i64> @test_mask_mul_epi32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmuldq (%rdi), %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <8 x i64> @llvm.x86.avx512.mask.pmul.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) @@ -2443,7 +2443,7 @@ define <8 x i64> @test_mask_mul_epi32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmuldq (%rdi){1to8}, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %q = load i64, i64* %ptr_b %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 @@ -2483,7 +2483,7 @@ define <8 x i64> @test_mask_mul_epu32_rrk(<16 x i32> %a, <16 x i32> %b, <8 x i64 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmuludq %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) ret < 8 x i64> %res @@ -2514,7 +2514,7 @@ define <8 x i64> @test_mask_mul_epu32_rmk(<16 x i32> %a, <16 x i32>* %ptr_b, <8 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmuludq (%rdi), 
%zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a, <16 x i32> %b, <8 x i64> %passThru, i8 %mask) @@ -2550,7 +2550,7 @@ define <8 x i64> @test_mask_mul_epu32_rmbk(<16 x i32> %a, i64* %ptr_b, <8 x i64> ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmuludq (%rdi){1to8}, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %q = load i64, i64* %ptr_b %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 @@ -2590,7 +2590,7 @@ define <16 x i32> @test_mask_mullo_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, < ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpmulld %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) ret < 16 x i32> %res @@ -2621,7 +2621,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmulld (%rdi), %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passThru, i16 %mask) @@ -2656,7 +2656,7 @@ define <16 x i32> @test_mask_mullo_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <1 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpmulld (%rdi){1to16}, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ; CHECK-NEXT: retq %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 @@ -3285,7 +3285,7 @@ define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovapd %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 0) ret <2 x double> %res @@ -3297,7 +3297,7 @@ define <2 x double> @test_mask_add_sd_rd(<2 x double> %a0, <2 x double> %a1, <2 ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovapd %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) ret <2 x double> %res @@ -3309,7 +3309,7 @@ define <2 x double> @test_mask_add_sd_ru(<2 x double> %a0, <2 x double> %a1, <2 ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {ru-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovapd %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 2) ret <2 x double> %res @@ -3321,7 +3321,7 @@ define <2 x double> @test_mask_add_sd_rz(<2 x double> %a0, <2 x double> %a1, <2 ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd {rz-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovaps 
%zmm2, %zmm0 +; CHECK-NEXT: vmovapd %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 3) ret <2 x double> %res @@ -3333,7 +3333,7 @@ define <2 x double> @test_mask_add_sd_current(<2 x double> %a0, <2 x double> %a1 ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vaddsd %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovapd %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) ret <2 x double> %res @@ -3432,7 +3432,7 @@ define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd {sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovapd %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8) ret <2 x double> %res @@ -3464,7 +3464,7 @@ define <2 x double> @test_mask_max_sd(<2 x double> %a0, <2 x double> %a1, <2 x d ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmaxsd %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vmovapd %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) ret <2 x double> %res @@ -3788,7 +3788,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 ; CHECK-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vpermi2d %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm3, %zmm0 @@ -3806,7 +3806,7 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 ; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: vaddpd %zmm1, %zmm3, %zmm0 @@ -3823,7 +3823,7 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 ; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: vaddps %zmm1, %zmm3, %zmm0 @@ -3840,7 +3840,7 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 ; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: vpaddq %zmm1, %zmm3, %zmm0 @@ -3857,7 +3857,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm2 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2 ; CHECK-NEXT: 
vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z} ; CHECK-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm0 @@ -3875,7 +3875,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %esi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm2 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm2 ; CHECK-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z} ; CHECK-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm0 @@ -3895,7 +3895,7 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 ; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 {%k1} {z} ; CHECK-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: vaddps %zmm1, %zmm3, %zmm0 @@ -3913,7 +3913,7 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 ; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z} ; CHECK-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: vpaddq %zmm1, %zmm3, %zmm0 @@ -3930,7 +3930,7 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 ; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 {%k1} ; CHECK-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; CHECK-NEXT: vpaddd %zmm1, %zmm3, %zmm0 @@ -4956,7 +4956,7 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d ; CHECK: ## BB#0: ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm2, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm3 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z} ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5 @@ -5197,7 +5197,7 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_d_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0 @@ -5214,7 +5214,7 @@ define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1} {z} ; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vpaddd %zmm0, %zmm3, %zmm0 @@ -5231,7 +5231,7 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64 ; CHECK-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 @@ -5248,7 +5248,7 @@ define <8 x 
i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-LABEL: test_int_x86_avx512_maskz_pternlog_q_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z} ; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vpaddq %zmm0, %zmm3, %zmm0 @@ -5995,7 +5995,7 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, < ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z} @@ -6017,10 +6017,10 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0, ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_pd_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vfixupimmpd $3, %zmm2, %zmm1, %zmm3 {%k1} {z} ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 -; CHECK-NEXT: vmovaps %zmm0, %zmm5 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z} ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1 @@ -6041,10 +6041,10 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl ; CHECK: ## BB#0: ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vmovaps %zmm0, %zmm5 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1} ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 ; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1 @@ -6065,9 +6065,9 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ss(<4 x float> %x0, <4 x f ; CHECK: ## BB#0: ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm3 {%k1} {z} -; CHECK-NEXT: vmovaps %zmm0, %zmm4 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 ; CHECK-NEXT: vfixupimmss $5, %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} @@ -6088,10 +6088,10 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, < ; CHECK-LABEL: test_int_x86_avx512_mask_fixupimm_ps_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 -; CHECK-NEXT: vmovaps %zmm0, %zmm5 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1} ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 ; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1 @@ -6111,9 +6111,9 @@ define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0, ; CHECK-LABEL: test_int_x86_avx512_maskz_fixupimm_ps_512: ; CHECK: ## BB#0: ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z} -; CHECK-NEXT: vmovaps 
%zmm0, %zmm4 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 ; CHECK-NEXT: vfixupimmps $5, %zmm2, %zmm1, %zmm4 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z} @@ -6135,9 +6135,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_sd(<2 x double> %x0, <2 x ; CHECK: ## BB#0: ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} -; CHECK-NEXT: vmovaps %zmm0, %zmm4 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm4 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm4 ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} @@ -6159,10 +6159,10 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x ; CHECK: ## BB#0: ; CHECK-NEXT: andl $1, %edi ; CHECK-NEXT: kmovw %edi, %k1 -; CHECK-NEXT: vmovaps %zmm0, %zmm3 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm3 ; CHECK-NEXT: vfixupimmsd $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; CHECK-NEXT: vmovaps %zmm0, %zmm5 +; CHECK-NEXT: vmovdqa64 %zmm0, %zmm5 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z} ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/avx512-logic.ll b/llvm/test/CodeGen/X86/avx512-logic.ll index d085467868ab82941661569c8e9d2522b3dac7ec..c8d3b51942558c2b705876f65a5de20a26c06d11 100644 --- a/llvm/test/CodeGen/X86/avx512-logic.ll +++ b/llvm/test/CodeGen/X86/avx512-logic.ll @@ -157,7 +157,7 @@ define <64 x i8> @and_v64i8(<64 x i8> %a, <64 x i8> %b) { ; ; SKX-LABEL: and_v64i8: ; SKX: ## BB#0: -; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %res = and <64 x i8> %a, %b ret <64 x i8> %res @@ -172,7 +172,7 @@ define <64 x i8> @andn_v64i8(<64 x i8> %a, <64 x i8> %b) { ; ; SKX-LABEL: andn_v64i8: ; SKX: ## BB#0: -; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 ; SKX-NEXT: retq %b2 = xor <64 x i8> %b, @or_v64i8(<64 x i8> %a, <64 x i8> %b) { ; ; SKX-LABEL: or_v64i8: ; SKX: ## BB#0: -; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %res = or <64 x i8> %a, %b ret <64 x i8> %res @@ -206,7 +206,7 @@ define <64 x i8> @xor_v64i8(<64 x i8> %a, <64 x i8> %b) { ; ; SKX-LABEL: xor_v64i8: ; SKX: ## BB#0: -; SKX-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %res = xor <64 x i8> %a, %b ret <64 x i8> %res @@ -221,7 +221,7 @@ define <32 x i16> @and_v32i16(<32 x i16> %a, <32 x i16> %b) { ; ; SKX-LABEL: and_v32i16: ; SKX: ## BB#0: -; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %res = and <32 x i16> %a, %b ret <32 x i16> %res @@ -236,7 +236,7 @@ define <32 x i16> @andn_v32i16(<32 x i16> %a, <32 x i16> %b) { ; ; SKX-LABEL: andn_v32i16: ; SKX: ## BB#0: -; SKX-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; SKX-NEXT: vandnps %zmm0, %zmm1, %zmm0 ; SKX-NEXT: retq %b2 = xor <32 x i16> %b, @@ -253,7 +253,7 @@ define <32 x i16> @or_v32i16(<32 x i16> %a, <32 x i16> %b) { ; ; SKX-LABEL: or_v32i16: ; SKX: ## BB#0: -; SKX-NEXT: vporq %zmm1, %zmm0, %zmm0 +; SKX-NEXT: vorps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %res = or <32 x i16> %a, %b ret <32 x i16> %res @@ -268,7 +268,7 @@ define <32 x i16> @xor_v32i16(<32 x i16> %a, <32 x i16> %b) { ; ; SKX-LABEL: xor_v32i16: ; SKX: ## BB#0: -; SKX-NEXT: vpxorq 
%zmm1, %zmm0, %zmm0 +; SKX-NEXT: vxorps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %res = xor <32 x i16> %a, %b ret <32 x i16> %res diff --git a/llvm/test/CodeGen/X86/avx512-mov.ll b/llvm/test/CodeGen/X86/avx512-mov.ll index 6b07e9e704db446f3f450af073d71f1491aebcb0..7c5c028f060918e59cd9d6e84daa59e4017497d3 100644 --- a/llvm/test/CodeGen/X86/avx512-mov.ll +++ b/llvm/test/CodeGen/X86/avx512-mov.ll @@ -231,7 +231,7 @@ define <8 x i64> @test23(i8 * %addr) { define void @test24(i8 * %addr, <8 x double> %data) { ; CHECK-LABEL: test24: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovapd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x29,0x07] +; CHECK-NEXT: vmovaps %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x29,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 64 @@ -241,7 +241,7 @@ define void @test24(i8 * %addr, <8 x double> %data) { define <8 x double> @test25(i8 * %addr) { ; CHECK-LABEL: test25: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovapd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0x07] +; CHECK-NEXT: vmovaps (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 64 @@ -271,7 +271,7 @@ define <16 x float> @test27(i8 * %addr) { define void @test28(i8 * %addr, <8 x double> %data) { ; CHECK-LABEL: test28: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovupd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x07] +; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* store <8 x double>%data, <8 x double>* %vaddr, align 1 @@ -281,7 +281,7 @@ define void @test28(i8 * %addr, <8 x double> %data) { define <8 x double> @test29(i8 * %addr) { ; CHECK-LABEL: test29: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %vaddr = bitcast i8* %addr to <8 x double>* %res = load <8 x double>, <8 x double>* %vaddr, align 1 diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index 2ac91cc7482ada0f9aa0f71012d80ec4129b94c9..fab6b3dda2424153f33b089e4233952ba69e9850 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -27,7 +27,7 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind { ; CHECK-NEXT: ## BB#1: ; CHECK-NEXT: vmovaps %zmm0, %zmm1 ; CHECK-NEXT: LBB1_2: -; CHECK-NEXT: vpxorq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vxorps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %cmpres = icmp eq i32 %a, 255 %selres = select i1 %cmpres, <8 x i64> zeroinitializer, <8 x i64> %b diff --git a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll index 005dc23ccf7b39589f57b75c9a31bc6d96109ebd..d7660e0b4eaeb2f301fcf401c5afa3a956c506d6 100644 --- a/llvm/test/CodeGen/X86/avx512-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx512-vbroadcast.ll @@ -359,7 +359,7 @@ define <64 x i8> @_invec32xi8(<32 x i8>%a) { ; AVX512F-LABEL: _invec32xi8: ; AVX512F: # BB#0: ; AVX512F-NEXT: vpbroadcastb %xmm0, %ymm0 -; AVX512F-NEXT: vmovaps %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: _invec32xi8: @@ -374,7 +374,7 @@ define <32 x i16> @_invec16xi16(<16 x i16>%a) { ; AVX512F-LABEL: _invec16xi16: ; 
AVX512F: # BB#0: ; AVX512F-NEXT: vpbroadcastw %xmm0, %ymm0 -; AVX512F-NEXT: vmovaps %zmm0, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: _invec16xi16: diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index b131befcf0a22056286742818e64cbc26304848e..1fc7a1290542d67ff7c5ee2e176202ea5c5d45d8 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -669,14 +669,14 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, < ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpackssdw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -719,7 +719,7 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpackssdw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512: @@ -727,7 +727,7 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpackssdw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) @@ -775,7 +775,7 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3 ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512: @@ -783,7 +783,7 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3 ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0 @@ -833,7 +833,7 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6 ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512: @@ -842,7 +842,7 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 ; AVX512F-32-NEXT: vpacksswb %zmm1, %zmm0, %zmm2 {%k1} -; 
AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res @@ -887,7 +887,7 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovq %rsi, %k1 ; AVX512BW-NEXT: vpacksswb (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512: @@ -897,7 +897,7 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 ; AVX512F-32-NEXT: vpacksswb (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) @@ -946,14 +946,14 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpackusdw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -996,7 +996,7 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %pt ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpackusdw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512: @@ -1004,7 +1004,7 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %pt ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpackusdw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %b = load <16 x i32>, <16 x i32>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) @@ -1052,7 +1052,7 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, < ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512: @@ -1060,7 +1060,7 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, < ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %q = load i32, i32* %ptr_b %vecinit.i = insertelement <16 x i32> 
undef, i32 %q, i32 0 @@ -1110,7 +1110,7 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, < ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovq %rdi, %k1 ; AVX512BW-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512: @@ -1119,7 +1119,7 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, < ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 ; AVX512F-32-NEXT: vpackuswb %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) ret <64 x i8> %res @@ -1164,7 +1164,7 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovq %rsi, %k1 ; AVX512BW-NEXT: vpackuswb (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512: @@ -1174,7 +1174,7 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: kunpckdq %k0, %k1, %k1 ; AVX512F-32-NEXT: vpackuswb (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) @@ -1222,14 +1222,14 @@ define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpaddsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -1272,7 +1272,7 @@ define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpaddsw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512: @@ -1280,7 +1280,7 @@ define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpaddsw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) @@ -1326,14 +1326,14 @@ define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; 
AVX512BW-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsubsw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -1376,7 +1376,7 @@ define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsubsw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512: @@ -1384,7 +1384,7 @@ define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsubsw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) @@ -1430,14 +1430,14 @@ define <32 x i16> @test_mask_adds_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_adds_epu16_rrk_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpaddusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -1480,7 +1480,7 @@ define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpaddusw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_adds_epu16_rmk_512: @@ -1488,7 +1488,7 @@ define <32 x i16> @test_mask_adds_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpaddusw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.paddus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) @@ -1534,14 +1534,14 @@ define <32 x i16> @test_mask_subs_epu16_rrk_512(<32 x i16> %a, <32 x i16> %b, <3 ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 ; AVX512BW-NEXT: vpsubusw %zmm1, %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovaps %zmm2, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_subs_epu16_rrk_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsubusw %zmm1, %zmm0, 
%zmm2 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm2, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-32-NEXT: retl %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -1584,7 +1584,7 @@ define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: vpsubusw (%rdi), %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vmovaps %zmm1, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512F-32-LABEL: test_mask_subs_epu16_rmk_512: @@ -1592,7 +1592,7 @@ define <32 x i16> @test_mask_subs_epu16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_ ; AVX512F-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 ; AVX512F-32-NEXT: vpsubusw (%eax), %zmm0, %zmm1 {%k1} -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) @@ -1825,7 +1825,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 ; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovaps %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 @@ -1834,7 +1834,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} ; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 ; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0 @@ -1851,7 +1851,7 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3 ; AVX512BW-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovaps %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z} ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 @@ -1860,7 +1860,7 @@ define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <3 ; AVX512F-32-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 {%k1} {z} ; AVX512F-32-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 ; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0 @@ -1877,7 +1877,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 ; AVX512BW-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; AVX512BW: ## BB#0: ; AVX512BW-NEXT: kmovd %edi, %k1 -; AVX512BW-NEXT: vmovaps %zmm1, %zmm3 +; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm3, %zmm0 @@ -1886,7 +1886,7 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 ; AVX512F-32-LABEL: 
test_int_x86_avx512_mask_vpermi2var_hi_512: ; AVX512F-32: # BB#0: ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: vmovaps %zmm1, %zmm3 +; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm3 ; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm3 {%k1} ; AVX512F-32-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 ; AVX512F-32-NEXT: vpaddw %zmm1, %zmm3, %zmm0 diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index cf1b15e804f27273e7e22f3f9a99c7e14090133e..f69bdc833468a33bd64428ba007832fdced45ffd 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -2269,7 +2269,7 @@ define <32 x i16> @test_mask_add_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0xd1] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -2300,7 +2300,7 @@ define <32 x i16> @test_mask_add_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpaddw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xfd,0x0f] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.padd.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) @@ -2334,7 +2334,7 @@ define <32 x i16> @test_mask_sub_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0xd1] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -2365,7 +2365,7 @@ define <32 x i16> @test_mask_sub_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpsubw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xf9,0x0f] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.psub.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) @@ -2399,7 +2399,7 @@ define <32 x i16> @test_mask_mullo_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, < ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] ; CHECK-NEXT: vpmullw %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0xd1] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; CHECK-NEXT: 
vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) ret <32 x i16> %res @@ -2430,7 +2430,7 @@ define <32 x i16> @test_mask_mullo_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr ; CHECK: ## BB#0: ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] ; CHECK-NEXT: vpmullw (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0xd5,0x0f] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %b = load <32 x i16>, <32 x i16>* %ptr_b %res = call <32 x i16> @llvm.x86.avx512.mask.pmull.w.512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll index f201082fb1a79823152e7e6058554f2e79bca5c1..0680290b0401fa816f6e135fab337d43b62fa955 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -15,7 +15,7 @@ define <8 x i64> @test_mask_mullo_epi64_rrk_512(<8 x i64> %a, <8 x i64> %b, <8 x ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf] ; CHECK-NEXT: vpmullq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0xd1] -; CHECK-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] +; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) ret <8 x i64> %res @@ -46,7 +46,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmk_512(<8 x i64> %a, <8 x i64>* %ptr_b, ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce] ; CHECK-NEXT: vpmullq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0x0f] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %b = load <8 x i64>, <8 x i64>* %ptr_b %res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) @@ -81,7 +81,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x ; CHECK: ## BB#0: ; CHECK-NEXT: kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce] ; CHECK-NEXT: vpmullq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x40,0x0f] -; CHECK-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %q = load i64, i64* %ptr_b %vecinit.i = insertelement <8 x i64> undef, i64 %q, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll index ce999855d1f1d28d365f7d57a0298bf286ca85f4..afdd8c34ad40996e24bf75a0b6d9842f6317c7a0 100644 --- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll @@ -24,11 +24,14 @@ declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>, define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; 
CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: -; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} -; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm3 {%k1} {z} -; CHECK: vpmultishiftqb %zmm1, %zmm0, %zmm0 -; CHECK: vpaddb %zmm3, %zmm2, %zmm1 -; CHECK: vpaddb %zmm0, %zmm1, %zmm0 +; CHECK: ## BB#0: +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 {%k1} {z} +; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddb %zmm3, %zmm2, %zmm1 +; CHECK-NEXT: vpaddb %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) %res2 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) @@ -42,15 +45,15 @@ declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1} -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} -; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0 -; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1} +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} +; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0 +; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3) %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) @@ -64,15 +67,15 @@ declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm3 -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1} -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 -; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} -; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0 -; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm3 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 {%k1} +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 +; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} +; CHECK-NEXT: vpaddb %zmm4, %zmm3, %zmm0 +; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> 
%x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3) %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) @@ -86,10 +89,10 @@ declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>, define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_512: ; CHECK: ## BB#0: -; CHECK-NEXT: kmovq %rdi, %k1 -; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} {z} -; CHECK-NEXT: vmovaps %zmm1, %zmm0 -; CHECK-NEXT: retq +; CHECK-NEXT: kmovq %rdi, %k1 +; CHECK-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} {z} +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 +; CHECK-NEXT: retq %res = call <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) ret <64 x i8> %res } diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index f9126b4614eb9d5318b0057241ee34a5ca430edc..9126e659774300a7e907a2740a621ae035736211 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -2330,7 +2330,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) { ; CHECK-LABEL: test_mask_andnot_epi64_rr_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandnq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0xc1] +; CHECK-NEXT: vandnps %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1) ret <2 x i64> %res @@ -2360,7 +2360,7 @@ define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi64_rm_128: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandnq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0x07] +; CHECK-NEXT: vandnps (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7c,0x08,0x55,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %b = load <2 x i64>, <2 x i64>* %ptr_b %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1) @@ -2434,7 +2434,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) { ; CHECK-LABEL: test_mask_andnot_epi64_rr_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandnq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0xc1] +; CHECK-NEXT: vandnps %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1) ret <4 x i64> %res @@ -2464,7 +2464,7 @@ define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) { ; CHECK-LABEL: test_mask_andnot_epi64_rm_256: ; CHECK: ## BB#0: -; CHECK-NEXT: vpandnq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0x07] +; CHECK-NEXT: vandnps (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7c,0x28,0x55,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %b = load <4 x i64>, <4 x i64>* %ptr_b %res 
= call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 41376cf602c4aa019a59e83410e0f8cbdc7e02dd..88747c935be4bb44f146f836a7e5e5e2630f44a1 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -857,7 +857,7 @@ declare <4 x float> @llvm.x86.avx512.mask.compress.ps.128(<4 x float> %data, <4 define void @compr7(i8* %addr, <8 x double> %data) { ; CHECK-LABEL: compr7: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovupd %zmm0, (%rdi) ## encoding: [0x62,0xf1,0xfd,0x48,0x11,0x07] +; CHECK-NEXT: vmovups %zmm0, (%rdi) ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] call void @llvm.x86.avx512.mask.compress.store.pd.512(i8* %addr, <8 x double> %data, i8 -1) ret void @@ -973,7 +973,7 @@ declare <4 x float> @llvm.x86.avx512.mask.expand.ps.128(<4 x float> %data, <4 x define <8 x double> @expand7(i8* %addr, <8 x double> %data) { ; CHECK-LABEL: expand7: ; CHECK: ## BB#0: -; CHECK-NEXT: vmovupd (%rdi), %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x10,0x07] +; CHECK-NEXT: vmovups (%rdi), %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.expand.load.pd.512(i8* %addr, <8 x double> %data, i8 -1) ret <8 x double> %res diff --git a/llvm/test/CodeGen/X86/fma_patterns_wide.ll b/llvm/test/CodeGen/X86/fma_patterns_wide.ll index 7b6509ad51c799bf12c33cce49b3d3737e44330e..bf9291c7119b382c894e9d1f41741e16cccec446 100644 --- a/llvm/test/CodeGen/X86/fma_patterns_wide.ll +++ b/llvm/test/CodeGen/X86/fma_patterns_wide.ll @@ -244,7 +244,7 @@ define <8 x double> @test_8f64_fmsub_load(<8 x double>* %a0, <8 x double> %a1, < ; AVX512: # BB#0: ; AVX512-NEXT: vmovapd (%rdi), %zmm2 ; AVX512-NEXT: vfmsub213pd %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovaps %zmm2, %zmm0 +; AVX512-NEXT: vmovapd %zmm2, %zmm0 ; AVX512-NEXT: retq %x = load <8 x double>, <8 x double>* %a0 %y = fmul <8 x double> %x, %a1 @@ -573,7 +573,7 @@ define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x do ; ; AVX512-LABEL: test_v8f64_interp: ; AVX512: # BB#0: -; AVX512-NEXT: vmovaps %zmm2, %zmm3 +; AVX512-NEXT: vmovapd %zmm2, %zmm3 ; AVX512-NEXT: vfnmadd213pd %zmm1, %zmm1, %zmm3 ; AVX512-NEXT: vfmadd213pd %zmm3, %zmm2, %zmm0 ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll index 3b748eeb2e5a25bd77e4794c24ef4df3dafb2ba7..c2407bd4248f81f9831867b414b09e128342a169 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -103,7 +103,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) { ; KNL_64: # BB#0: ; KNL_64-NEXT: kmovw %esi, %k1 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test3: @@ -111,14 +111,14 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) { ; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} -; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test3: ; SKX: # BB#0: ; SKX-NEXT: kmovw %esi, %k1 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 ; SKX-NEXT: retq %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0 @@ -138,7 +138,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { ; KNL_64-NEXT: kmovw %esi, %k1 ; KNL_64-NEXT: kmovw %k1, %k2 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} -; KNL_64-NEXT: vmovaps %zmm1, %zmm2 +; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2 ; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} ; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0 ; KNL_64-NEXT: retq @@ -149,7 +149,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; KNL_32-NEXT: kmovw %k1, %k2 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} -; KNL_32-NEXT: vmovaps %zmm1, %zmm2 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2 ; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 ; KNL_32-NEXT: retl @@ -159,7 +159,7 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) { ; SKX-NEXT: kmovw %esi, %k1 ; SKX-NEXT: kmovw %k1, %k2 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} -; SKX-NEXT: vmovaps %zmm1, %zmm2 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm2 ; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} ; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0 ; SKX-NEXT: retq @@ -246,7 +246,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) { ; KNL_64-NEXT: kxnorw %k0, %k0, %k2 ; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} ; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} -; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test6: @@ -256,7 +256,7 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) { ; KNL_32-NEXT: kxnorw %k0, %k0, %k2 ; KNL_32-NEXT: vpgatherqd (,%zmm2), %ymm1 {%k2} ; KNL_32-NEXT: vpscatterqd %ymm0, (,%zmm2) {%k1} -; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test6: @@ -282,7 +282,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) { ; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_64-NEXT: kmovw %k1, %k2 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm1 {%k2} -; KNL_64-NEXT: vmovaps %zmm1, %zmm2 +; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2 ; KNL_64-NEXT: vpgatherqd (%rdi,%zmm0,4), %ymm2 {%k1} ; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0 ; KNL_64-NEXT: retq @@ -295,7 +295,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) { ; KNL_32-NEXT: vpmovsxdq %ymm0, %zmm0 ; KNL_32-NEXT: kmovw %k1, %k2 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm1 {%k2} -; KNL_32-NEXT: vmovaps %zmm1, %zmm2 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2 ; KNL_32-NEXT: vpgatherqd (%eax,%zmm0,4), %ymm2 {%k1} ; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0 ; KNL_32-NEXT: retl @@ -344,7 +344,7 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) { ; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; KNL_32-NEXT: kmovw %k1, %k2 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} -; KNL_32-NEXT: vmovaps %zmm1, %zmm2 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} ; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 ; KNL_32-NEXT: retl @@ -369,7 +369,7 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) { ; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; SKX_32-NEXT: kmovw %k1, %k2 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} -; SKX_32-NEXT: vmovaps %zmm1, %zmm2 +; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} ; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 ; 
SKX_32-NEXT: retl @@ -733,7 +733,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} -; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: vmovapd %zmm2, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test16: @@ -748,7 +748,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x ; KNL_32-NEXT: vpsllvq .LCPI15_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} -; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: vmovapd %zmm2, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test16: @@ -783,7 +783,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1} -; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: vmovapd %zmm2, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test17: @@ -794,7 +794,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x ; KNL_32-NEXT: vpsllvq .LCPI16_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1} -; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: vmovapd %zmm2, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test17: @@ -1080,7 +1080,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} -; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test23: @@ -1091,7 +1091,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> % ; KNL_32-NEXT: vpsllvq .LCPI22_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} -; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test23: @@ -1122,7 +1122,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_64: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} -; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test24: @@ -1133,7 +1133,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) { ; KNL_32-NEXT: vpsllvq .LCPI23_1, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} -; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test24: @@ -1165,7 +1165,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; KNL_64-NEXT: vpsllq $63, %zmm1, %zmm1 ; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1} -; KNL_64-NEXT: vmovaps %zmm2, %zmm0 +; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test25: @@ -1176,7 +1176,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> % ; KNL_32-NEXT: vpsllvq .LCPI24_0, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1} -; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test25: @@ 
-1208,7 +1208,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; KNL_64: movb $3, %al ; KNL_64-NEXT: kmovw %eax, %k1 ; KNL_64-NEXT: vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1} -; KNL_64-NEXT: vmovaps %zmm1, %zmm0 +; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test26: @@ -1219,7 +1219,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) { ; KNL_32-NEXT: vpsllvq .LCPI25_1, %zmm2, %zmm2 ; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k1 ; KNL_32-NEXT: vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1} -; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test26: @@ -1546,15 +1546,15 @@ define <16 x float*> @test31(<16 x float**> %ptrs) { ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} ; KNL_64-NEXT: kshiftrw $8, %k1, %k1 ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} -; KNL_64-NEXT: vmovaps %zmm2, %zmm0 -; KNL_64-NEXT: vmovaps %zmm3, %zmm1 +; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm0 +; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm1 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test31: ; KNL_32: # BB#0: ; KNL_32-NEXT: kxnorw %k0, %k0, %k1 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} -; KNL_32-NEXT: vmovaps %zmm1, %zmm0 +; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test31: @@ -1564,15 +1564,15 @@ define <16 x float*> @test31(<16 x float**> %ptrs) { ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2} ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1} -; SKX-NEXT: vmovaps %zmm2, %zmm0 -; SKX-NEXT: vmovaps %zmm3, %zmm1 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm3, %zmm1 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test31: ; SKX_32: # BB#0: ; SKX_32-NEXT: kxnorw %k0, %k0, %k1 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} -; SKX_32-NEXT: vmovaps %zmm1, %zmm0 +; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0 ; SKX_32-NEXT: retl %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> , <16 x float*> undef) @@ -1598,7 +1598,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 ; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} -; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL_32-NEXT: retl ; ; SKX-LABEL: test_gather_16i32: @@ -1619,7 +1619,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i ; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 ; SKX_32-NEXT: vptestmd %zmm1, %zmm1, %k1 ; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} -; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; SKX_32-NEXT: retl %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0) ret <16 x i32> %res @@ -1633,8 +1633,8 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} ; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} -; KNL_64-NEXT: vmovaps %zmm3, %zmm0 -; KNL_64-NEXT: vmovaps %zmm4, %zmm1 +; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0 +; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_gather_16i64: @@ -1657,7 +1657,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} -; KNL_32-NEXT: vmovaps %zmm2, %zmm0 
+; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp ; KNL_32-NEXT: retl @@ -1670,8 +1670,8 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} ; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} -; SKX-NEXT: vmovaps %zmm3, %zmm0 -; SKX-NEXT: vmovaps %zmm4, %zmm1 +; SKX-NEXT: vmovdqa64 %zmm3, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm4, %zmm1 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_gather_16i64: @@ -1694,7 +1694,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 ; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} -; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0 ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp ; SKX_32-NEXT: retl @@ -1756,8 +1756,8 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; KNL_64-NEXT: kshiftrw $8, %k1, %k2 ; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} ; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2} -; KNL_64-NEXT: vmovaps %zmm3, %zmm0 -; KNL_64-NEXT: vmovaps %zmm4, %zmm1 +; KNL_64-NEXT: vmovapd %zmm3, %zmm0 +; KNL_64-NEXT: vmovapd %zmm4, %zmm1 ; KNL_64-NEXT: retq ; ; KNL_32-LABEL: test_gather_16f64: @@ -1780,7 +1780,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} ; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} -; KNL_32-NEXT: vmovaps %zmm2, %zmm0 +; KNL_32-NEXT: vmovapd %zmm2, %zmm0 ; KNL_32-NEXT: movl %ebp, %esp ; KNL_32-NEXT: popl %ebp ; KNL_32-NEXT: retl @@ -1793,8 +1793,8 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} ; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2} -; SKX-NEXT: vmovaps %zmm3, %zmm0 -; SKX-NEXT: vmovaps %zmm4, %zmm1 +; SKX-NEXT: vmovapd %zmm3, %zmm0 +; SKX-NEXT: vmovapd %zmm4, %zmm1 ; SKX-NEXT: retq ; ; SKX_32-LABEL: test_gather_16f64: @@ -1817,7 +1817,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, < ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} ; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0 ; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} -; SKX_32-NEXT: vmovaps %zmm2, %zmm0 +; SKX_32-NEXT: vmovapd %zmm2, %zmm0 ; SKX_32-NEXT: movl %ebp, %esp ; SKX_32-NEXT: popl %ebp ; SKX_32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/masked_memop.ll b/llvm/test/CodeGen/X86/masked_memop.ll index 8b5146e5e93c917d917efe35cc2f32442ee7feb8..1b9f80faeaed81b9bc8067a5576fb959e9e80591 100644 --- a/llvm/test/CodeGen/X86/masked_memop.ll +++ b/llvm/test/CodeGen/X86/masked_memop.ll @@ -200,7 +200,7 @@ define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> ; AVX512F-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX512F-NEXT: vpcmpeqd %zmm2, %zmm0, %k1 ; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1} -; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: vmovapd %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test5: @@ -208,7 +208,7 @@ define <8 x double> @test5(<8 x i32> %trigger, <8 x double>* %addr, <8 x double> ; SKX-NEXT: vpxord %ymm2, %ymm2, %ymm2 ; SKX-NEXT: vpcmpeqd %ymm2, %ymm0, %k1 ; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm1, %zmm0 ; SKX-NEXT: retq %mask = icmp eq <8 x i32> %trigger, 
zeroinitializer %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1>%mask, <8 x double>%dst) @@ -501,7 +501,7 @@ define <8 x i32> @test11b(<8 x i1> %mask, <8 x i32>* %addr, <8 x i32> %dst) { ; AVX512F-NEXT: kshiftlw $8, %k0, %k0 ; AVX512F-NEXT: kshiftrw $8, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm1 {%k1} -; AVX512F-NEXT: vmovaps %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test11b: @@ -1314,7 +1314,7 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) { ; AVX512-LABEL: one_mask_bit_set5: ; AVX512: ## BB#0: ; AVX512-NEXT: vextractf32x4 $3, %zmm0, %xmm0 -; AVX512-NEXT: vmovlpd %xmm0, 48(%rdi) +; AVX512-NEXT: vmovlps %xmm0, 48(%rdi) ; AVX512-NEXT: retq call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1>) ret void @@ -1877,8 +1877,8 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64 ; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 ; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} -; AVX512F-NEXT: vmovaps %zmm1, %zmm0 -; AVX512F-NEXT: vmovaps %zmm2, %zmm1 +; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test_load_16i64: @@ -1888,8 +1888,8 @@ define <16 x i64> @test_load_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64 ; SKX-NEXT: vmovdqu64 (%rdi), %zmm1 {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovdqu64 64(%rdi), %zmm2 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 -; SKX-NEXT: vmovaps %zmm2, %zmm1 +; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 +; SKX-NEXT: vmovdqa64 %zmm2, %zmm1 ; SKX-NEXT: retq %res = call <16 x i64> @llvm.masked.load.v16i64.p0v16i64(<16 x i64>* %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) ret <16 x i64> %res @@ -1981,8 +1981,8 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 ; AVX512F-NEXT: vmovupd (%rdi), %zmm1 {%k1} ; AVX512F-NEXT: kshiftrw $8, %k1, %k1 ; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} -; AVX512F-NEXT: vmovaps %zmm1, %zmm0 -; AVX512F-NEXT: vmovaps %zmm2, %zmm1 +; AVX512F-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-NEXT: vmovapd %zmm2, %zmm1 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test_load_16f64: @@ -1992,8 +1992,8 @@ define <16 x double> @test_load_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 ; SKX-NEXT: vmovupd (%rdi), %zmm1 {%k1} ; SKX-NEXT: kshiftrw $8, %k1, %k1 ; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} -; SKX-NEXT: vmovaps %zmm1, %zmm0 -; SKX-NEXT: vmovaps %zmm2, %zmm1 +; SKX-NEXT: vmovapd %zmm1, %zmm0 +; SKX-NEXT: vmovapd %zmm2, %zmm1 ; SKX-NEXT: retq %res = call <16 x double> @llvm.masked.load.v16f64.p0v16f64(<16 x double>* %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) ret <16 x double> %res @@ -2204,10 +2204,10 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 ; AVX512F-NEXT: vmovupd 192(%rdi), %zmm4 {%k1} ; AVX512F-NEXT: kshiftrw $8, %k2, %k1 ; AVX512F-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} -; AVX512F-NEXT: vmovaps %zmm1, %zmm0 -; AVX512F-NEXT: vmovaps %zmm2, %zmm1 -; AVX512F-NEXT: vmovaps %zmm3, %zmm2 -; AVX512F-NEXT: vmovaps %zmm4, %zmm3 +; AVX512F-NEXT: vmovapd %zmm1, %zmm0 +; AVX512F-NEXT: vmovapd %zmm2, %zmm1 +; AVX512F-NEXT: vmovapd %zmm3, %zmm2 +; AVX512F-NEXT: vmovapd %zmm4, %zmm3 ; AVX512F-NEXT: retq ; ; SKX-LABEL: test_load_32f64: @@ -2221,10 +2221,10 @@ define <32 x double> @test_load_32f64(<32 x double>* %ptrs, <32 x i1> %mask, <32 ; SKX-NEXT: vmovupd 64(%rdi), %zmm2 {%k1} ; SKX-NEXT: 
kshiftrw $8, %k2, %k1
 ; SKX-NEXT: vmovupd 192(%rdi), %zmm4 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
-; SKX-NEXT: vmovaps %zmm2, %zmm1
-; SKX-NEXT: vmovaps %zmm3, %zmm2
-; SKX-NEXT: vmovaps %zmm4, %zmm3
+; SKX-NEXT: vmovapd %zmm1, %zmm0
+; SKX-NEXT: vmovapd %zmm2, %zmm1
+; SKX-NEXT: vmovapd %zmm3, %zmm2
+; SKX-NEXT: vmovapd %zmm4, %zmm3
 ; SKX-NEXT: retq
 %res = call <32 x double> @llvm.masked.load.v32f64.p0v32f64(<32 x double>* %ptrs, i32 4, <32 x i1> %mask, <32 x double> %src0)
 ret <32 x double> %res
@@ -5538,7 +5538,7 @@ define <64 x i8> @test_mask_load_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x
 ; SKX-NEXT: vpsllw $7, %zmm0, %zmm0
 ; SKX-NEXT: vpmovb2m %zmm0, %k1
 ; SKX-NEXT: vmovdqu8 (%rdi), %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
 ; SKX-NEXT: retq
 %res = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* %addr, i32 4, <64 x i1>%mask, <64 x i8> %val)
 ret <64 x i8> %res
@@ -6912,7 +6912,7 @@ define <32 x i16> @test_mask_load_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32
 ; SKX-NEXT: vpsllw $7, %ymm0, %ymm0
 ; SKX-NEXT: vpmovb2m %ymm0, %k1
 ; SKX-NEXT: vmovdqu16 (%rdi), %zmm1 {%k1}
-; SKX-NEXT: vmovaps %zmm1, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
 ; SKX-NEXT: retq
 %res = call <32 x i16> @llvm.masked.load.v32i16.p0v32i16(<32 x i16>* %addr, i32 4, <32 x i1>%mask, <32 x i16> %val)
 ret <32 x i16> %res
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
index bb9a342ae9ae46352d4e8de47e85f15a1ce76c40..bc06d8f190448d8bf22e084d59deea93cc9a72d3 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -83,13 +83,13 @@ define <8 x double> @merge_8f64_4f64_z2(<4 x double>* %ptr) nounwind uwtable noi
 define <8 x double> @merge_8f64_f64_23uuuuu9(double* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8f64_f64_23uuuuu9:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovupd 16(%rdi), %zmm0
+; ALL-NEXT: vmovups 16(%rdi), %zmm0
 ; ALL-NEXT: retq
 ;
 ; X32-AVX512F-LABEL: merge_8f64_f64_23uuuuu9:
 ; X32-AVX512F: # BB#0:
 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovupd 16(%eax), %zmm0
+; X32-AVX512F-NEXT: vmovups 16(%eax), %zmm0
 ; X32-AVX512F-NEXT: retl
 %ptr0 = getelementptr inbounds double, double* %ptr, i64 2
 %ptr1 = getelementptr inbounds double, double* %ptr, i64 3
@@ -138,7 +138,7 @@ define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noin
 define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovupd 8(%rdi), %zmm0
+; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0
 ; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; ALL-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,2,u,4,13,u,7>
 ; ALL-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
@@ -147,7 +147,7 @@ define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noin
 ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
 ; X32-AVX512F: # BB#0:
 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0
+; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0
 ; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
 ; X32-AVX512F-NEXT: vpermt2pd %zmm1, %zmm2, %zmm0
@@ -334,7 +334,7 @@ define <16 x float> @merge_16f32_f32_0uu3uuuuuuuuCuEF(float* %ptr) nounwind uwta
 define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovups (%rdi), %zmm0
+; ALL-NEXT: vmovdqu64 (%rdi), %zmm0
 ; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
 ; ALL-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
@@ -343,7 +343,7 @@ define <16 x float> @merge_16f32_f32_0uu3zzuuuuuzCuEF(float* %ptr) nounwind uwta
 ; X32-AVX512F-LABEL: merge_16f32_f32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F: # BB#0:
 ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovups (%eax), %zmm0
+; X32-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0
 ; X32-AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
 ; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
 ; X32-AVX512F-NEXT: vpermt2ps %zmm1, %zmm2, %zmm0
diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll
index e221f8e9520b5861c6ed081c864bc70522b4b69f..b97d38e18f4482e6eacda1a9383a011ac7958d55 100644
--- a/llvm/test/CodeGen/X86/nontemporal-2.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-2.ll
@@ -117,7 +117,7 @@ define void @test_zero_v4f32(<4 x float>* %dst) {
 ; VLX-LABEL: test_zero_v4f32:
 ; VLX: # BB#0:
 ; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <4 x float> zeroinitializer, <4 x float>* %dst, align 16, !nontemporal !1
 ret void
@@ -139,7 +139,7 @@ define void @test_zero_v4i32(<4 x i32>* %dst) {
 ; VLX-LABEL: test_zero_v4i32:
 ; VLX: # BB#0:
 ; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
 store <4 x i32> zeroinitializer, <4 x i32>* %dst, align 16, !nontemporal !1
@@ -162,7 +162,7 @@ define void @test_zero_v2f64(<2 x double>* %dst) {
 ; VLX-LABEL: test_zero_v2f64:
 ; VLX: # BB#0:
 ; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <2 x double> zeroinitializer, <2 x double>* %dst, align 16, !nontemporal !1
 ret void
@@ -184,7 +184,7 @@ define void @test_zero_v2i64(<2 x i64>* %dst) {
 ; VLX-LABEL: test_zero_v2i64:
 ; VLX: # BB#0:
 ; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <2 x i64> zeroinitializer, <2 x i64>* %dst, align 16, !nontemporal !1
 ret void
@@ -206,7 +206,7 @@ define void @test_zero_v8i16(<8 x i16>* %dst) {
 ; VLX-LABEL: test_zero_v8i16:
 ; VLX: # BB#0:
 ; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <8 x i16> zeroinitializer, <8 x i16>* %dst, align 16, !nontemporal !1
 ret void
@@ -228,7 +228,7 @@ define void @test_zero_v16i8(<16 x i8>* %dst) {
 ; VLX-LABEL: test_zero_v16i8:
 ; VLX: # BB#0:
 ; VLX-NEXT: vpxord %xmm0, %xmm0, %xmm0
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <16 x i8> zeroinitializer, <16 x i8>* %dst, align 16, !nontemporal !1
 ret void
@@ -657,7 +657,7 @@ define void @test_arg_v4i32(<4 x i32> %arg, <4 x i32>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v4i32:
 ; VLX: # BB#0:
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <4 x i32> %arg, <4 x i32>* %dst, align 16, !nontemporal !1
 ret void
@@ -676,7 +676,7 @@ define void @test_arg_v2f64(<2 x double> %arg, <2 x double>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v2f64:
 ; VLX: # BB#0:
-; VLX-NEXT: vmovntpd %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <2 x double> %arg, <2 x double>* %dst, align 16, !nontemporal !1
 ret void
@@ -695,7 +695,7 @@ define void @test_arg_v2i64(<2 x i64> %arg, <2 x i64>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v2i64:
 ; VLX: # BB#0:
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <2 x i64> %arg, <2 x i64>* %dst, align 16, !nontemporal !1
 ret void
@@ -714,7 +714,7 @@ define void @test_arg_v8i16(<8 x i16> %arg, <8 x i16>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v8i16:
 ; VLX: # BB#0:
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <8 x i16> %arg, <8 x i16>* %dst, align 16, !nontemporal !1
 ret void
@@ -733,7 +733,7 @@ define void @test_arg_v16i8(<16 x i8> %arg, <16 x i8>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v16i8:
 ; VLX: # BB#0:
-; VLX-NEXT: vmovntdq %xmm0, (%rdi)
+; VLX-NEXT: vmovntps %xmm0, (%rdi)
 ; VLX-NEXT: retq
 store <16 x i8> %arg, <16 x i8>* %dst, align 16, !nontemporal !1
 ret void
diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll
index 83301e60a1c4ae868778755fe844387abc1ed3b6..deb0cb9bbb2dfe4669d7f6f5f59e579c40aaeef0 100644
--- a/llvm/test/CodeGen/X86/nontemporal-loads.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll
@@ -1536,7 +1536,7 @@ define <8 x double> @test_unaligned_v8f64(<8 x double>* %src) {
 ;
 ; AVX512-LABEL: test_unaligned_v8f64:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vmovupd (%rdi), %zmm0
+; AVX512-NEXT: vmovups (%rdi), %zmm0
 ; AVX512-NEXT: retq
 %1 = load <8 x double>, <8 x double>* %src, align 1, !nontemporal !1
 ret <8 x double> %1
diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll
index 5f2c88d670acbae61f19af4c970487298e432996..5ae2064010605593f6737f0e8cc709931aab2bee 100644
--- a/llvm/test/CodeGen/X86/pmul.ll
+++ b/llvm/test/CodeGen/X86/pmul.ll
@@ -391,7 +391,7 @@ define <2 x i64> @mul_v2i64spill(<2 x i64> %i, <2 x i64> %j) nounwind {
 ; AVX512-NEXT: vmovdqa (%rsp), %xmm4 # 16-byte Reload
 ; AVX512-NEXT: vpmuludq %xmm2, %xmm4, %xmm0
 ; AVX512-NEXT: vpsrlq $32, %xmm2, %xmm1
-; AVX512-NEXT: vmovaps %zmm2, %zmm3
+; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3
 ; AVX512-NEXT: vpmuludq %xmm1, %xmm4, %xmm1
 ; AVX512-NEXT: vpsllq $32, %xmm1, %xmm1
 ; AVX512-NEXT: vpsrlq $32, %xmm4, %xmm2
diff --git a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
index 4a16c3198aa5f65fbae0c439bc59026ad97488e0..9ea86b08f7ae6a1972d9652c9be0216705215559 100644
--- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -75,7 +75,7 @@ define x86_fp80 @s32_to_x(i32 %a) nounwind {
 ; CHECK-LABEL: u64_to_f
 ; AVX512_32: vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: vmovlps %xmm0, {{[0-9]+}}(%esp)
 ; AVX512_32: fildll
 ; AVX512_64: vcvtusi2ssq
@@ -111,7 +111,7 @@ define float @s64_to_f(i64 %a) nounwind {
 ; AVX512_32: vmovd %eax, %xmm0
 ; AVX512_32: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: vmovq %xmm0, {{[0-9]+}}(%esp)
 ; AVX512_32: fildll {{[0-9]+}}(%esp)
 define float @s64_to_f_2(i64 %a) nounwind {
@@ -151,7 +151,7 @@ define double @s64_to_d(i64 %a) nounwind {
 ; AVX512_32: vmovd %eax, %xmm0
 ; AVX512_32: vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX512_32: vmovlpd %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32: vmovq %xmm0, {{[0-9]+}}(%esp)
 ; AVX512_32: fildll
 define double @s64_to_d_2(i64 %a) nounwind {
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index 5dec0b8fa11b1fa2aa5e07b3e54a2f81f1682e2a..b457969d31e6fbb1d9c96d2c88c5de907eac4c60 100644
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -76,7 +76,7 @@ declare <4 x float> @llvm.x86.sse.add.ss(<4 x float>, <4 x float>) nounwind read
 define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
 ;CHECK-LABEL: stack_fold_andpd
- ;CHECK: vpandq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
 %2 = bitcast <2 x double> %a0 to <2 x i64>
 %3 = bitcast <2 x double> %a1 to <2 x i64>
@@ -89,7 +89,7 @@ define <2 x double> @stack_fold_andpd(<2 x double> %a0, <2 x double> %a1) {
 define <4 x double> @stack_fold_andpd_ymm(<4 x double> %a0, <4 x double> %a1) {
 ;CHECK-LABEL: stack_fold_andpd_ymm
- ;CHECK: vpandq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vandpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
 %2 = bitcast <4 x double> %a0 to <4 x i64>
 %3 = bitcast <4 x double> %a1 to <4 x i64>
@@ -198,7 +198,7 @@ declare <4 x float> @llvm.x86.sse.mul.ss(<4 x float>, <4 x float>) nounwind read
 define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
 ;CHECK-LABEL: stack_fold_orpd
- ;CHECK: vporq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
 %2 = bitcast <2 x double> %a0 to <2 x i64>
 %3 = bitcast <2 x double> %a1 to <2 x i64>
@@ -211,7 +211,7 @@ define <2 x double> @stack_fold_orpd(<2 x double> %a0, <2 x double> %a1) {
 define <4 x double> @stack_fold_orpd_ymm(<4 x double> %a0, <4 x double> %a1) {
 ;CHECK-LABEL: stack_fold_orpd_ymm
- ;CHECK: vporq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
 %2 = bitcast <4 x double> %a0 to <4 x i64>
 %3 = bitcast <4 x double> %a1 to <4 x i64>
@@ -316,7 +316,7 @@ declare <4 x float> @llvm.x86.sse.sub.ss(<4 x float>, <4 x float>) nounwind read
 define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
 ;CHECK-LABEL: stack_fold_xorpd
- ;CHECK: vpxorq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
 %2 = bitcast <2 x double> %a0 to <2 x i64>
 %3 = bitcast <2 x double> %a1 to <2 x i64>
@@ -329,7 +329,7 @@ define <2 x double> @stack_fold_xorpd(<2 x double> %a0, <2 x double> %a1) {
 define <4 x double> @stack_fold_xorpd_ymm(<4 x double> %a0, <4 x double> %a1) {
 ;CHECK-LABEL: stack_fold_xorpd_ymm
- ;CHECK: vpxorq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+ ;CHECK: vxorpd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
 %2 = bitcast <4 x double> %a0 to <4 x i64>
 %3 = bitcast <4 x double> %a1 to <4 x i64>
diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll
index b091d1bca2eff83ea3615f5985fc7fed5e891138..3261e988ffb6c07d5675d91c54338edd2aab4848 100644
--- a/llvm/test/CodeGen/X86/vector-half-conversions.ll
+++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll
@@ -3001,7 +3001,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
 ; AVX512-NEXT: .cfi_offset %r14, -24
 ; AVX512-NEXT: .Ltmp24:
 ; AVX512-NEXT: .cfi_offset %r15, -16
-; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
+; AVX512-NEXT: vmovupd %zmm0, (%rsp) # 64-byte Spill
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: callq __truncdfhf2
 ; AVX512-NEXT: movw %ax, %bx
@@ -3011,9 +3011,9 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
 ; AVX512-NEXT: callq __truncdfhf2
 ; AVX512-NEXT: movzwl %ax, %r15d
 ; AVX512-NEXT: orl %ebx, %r15d
-; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: callq __truncdfhf2
 ; AVX512-NEXT: movw %ax, %bx
@@ -3024,7 +3024,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) {
 ; AVX512-NEXT: orl %ebx, %r14d
 ; AVX512-NEXT: shlq $32, %r14
 ; AVX512-NEXT: orq %r15, %r14
-; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovupd (%rsp), %zmm0 # 64-byte Reload
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
 ; AVX512-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -3862,17 +3862,17 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) {
 ; AVX512-NEXT: .Ltmp67:
 ; AVX512-NEXT: .cfi_offset %rbp, -16
 ; AVX512-NEXT: movq %rdi, %rbx
-; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
+; AVX512-NEXT: vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: callq __truncdfhf2
 ; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX512-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; AVX512-NEXT: vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
 ; AVX512-NEXT: callq __truncdfhf2
 ; AVX512-NEXT: movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
+; AVX512-NEXT: vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
 ; AVX512-NEXT: vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX512-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index d75184951344165b249f14c32eac0a5d737e87ef..4d441aa67ce5cee57433a421252529770c4888e7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -262,7 +262,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
 define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
 ; ALL-LABEL: shuffle_v16f32_extract_256:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovups (%rsi), %zmm0
+; ALL-NEXT: vmovupd (%rsi), %zmm0
 ; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
 ; ALL-NEXT: retq
 %ptr_a = bitcast float* %a to <16 x float>*
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index d39961d9c427a2b924fb5e83ddae45b8964fd718..da8ea83ee94e5987ece8259d026fe692f558421d 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -268,14 +268,14 @@ define <8 x double> @shuffle_v8f64_8823cc67(<8 x double> %a, <8 x double> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
 ; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_8823cc67:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
 ; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
 ret <8 x double> %shuffle
@@ -287,14 +287,14 @@ define <8 x double> @shuffle_v8f64_9832dc76(<8 x double> %a, <8 x double> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
 ; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_9832dc76:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
 ; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
 ret <8 x double> %shuffle
@@ -306,14 +306,14 @@ define <8 x double> @shuffle_v8f64_9810dc54(<8 x double> %a, <8 x double> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
 ; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_9810dc54:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
 ; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
 ret <8 x double> %shuffle
@@ -376,14 +376,14 @@ define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
 ; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_08991abb:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
 ; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
 ret <8 x double> %shuffle
@@ -412,14 +412,14 @@ define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
 ; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_09ab1def:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
 ; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
 ret <8 x double> %shuffle
@@ -933,14 +933,14 @@ define <8 x double> @shuffle_v8f64_c348cda0(<8 x double> %a, <8 x double> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,11,12,0,4,5,2,8]
 ; AVX512F-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8f64_c348cda0:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,0,11,0,12,0,0,0,4,0,5,0,2,0,8,0]
 ; AVX512F-32-NEXT: vpermt2pd %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32>
 ret <8 x double> %shuffle
@@ -1191,14 +1191,14 @@ define <8 x i64> @shuffle_v8i64_81a3c5e7(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
 ; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
 ret <8 x i64> %shuffle
@@ -1244,14 +1244,14 @@ define <8 x i64> @shuffle_v8i64_8823cc67(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,10,11,4,4,14,15]
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_8823cc67:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,10,0,11,0,4,0,4,0,14,0,15,0]
 ; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
 ret <8 x i64> %shuffle
@@ -1263,14 +1263,14 @@ define <8 x i64> @shuffle_v8i64_9832dc76(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,11,10,5,4,15,14]
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_9832dc76:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,11,0,10,0,5,0,4,0,15,0,14,0]
 ; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
 ret <8 x i64> %shuffle
@@ -1282,14 +1282,14 @@ define <8 x i64> @shuffle_v8i64_9810dc54(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,9,8,5,4,13,12]
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_9810dc54:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,0,0,0,9,0,8,0,5,0,4,0,13,0,12,0]
 ; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
 ret <8 x i64> %shuffle
@@ -1352,14 +1352,14 @@ define <8 x i64> @shuffle_v8i64_08991abb(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,1,9,2,3,3]
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_08991abb:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,0,0,1,0,1,0,9,0,2,0,3,0,3,0]
 ; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
 ret <8 x i64> %shuffle
@@ -1388,14 +1388,14 @@ define <8 x i64> @shuffle_v8i64_09ab1def(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,1,2,3,9,5,6,7]
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_09ab1def:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,0,1,0,2,0,3,0,9,0,5,0,6,0,7,0]
 ; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
 ret <8 x i64> %shuffle
@@ -1925,14 +1925,14 @@ define <8 x i64> @shuffle_v8i64_6caa87e5(<8 x i64> %a, <8 x i64> %b) {
 ; AVX512F: # BB#0:
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,4,2,2,0,15,6,13]
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_6caa87e5:
 ; AVX512F-32: # BB#0:
 ; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,0,4,0,2,0,2,0,0,0,15,0,6,0,13,0]
 ; AVX512F-32-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
-; AVX512F-32-NEXT: vmovaps %zmm1, %zmm0
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
 ret <8 x i64> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index baf1054170bacb81061e73c7a7b736a4ec6c1426..3e0c3e4d8f3f89c4a8a2df95785e36084ec58705 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -33,7 +33,7 @@ define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x d
 ; CHECK-NEXT: vpermpd %zmm0, %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
 ; CHECK-NEXT: vpermpd %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res0 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 %m)
 %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %res0, <8 x i64> , <8 x double> %res0, i8 %m)
@@ -56,7 +56,7 @@ define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x
 ; CHECK-NEXT: vpermq %zmm0, %zmm2, %zmm1 {%k1}
 ; CHECK-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
 ; CHECK-NEXT: vpermq %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res0 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %m)
 %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %res0, <8 x i64> , <8 x i64> %res0, i8 %m)
@@ -168,10 +168,10 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup(<16 x float> %x0, <16 x f
 define <16 x float> @combine_vpermt2var_16f32_vmovddup_load(<16 x float> *%p0, <16 x float> %x1) {
 ; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_load:
 ; CHECK: # BB#0:
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
 ; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %x0 = load <16 x float>, <16 x float> *%p0
 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 -1)
@@ -191,10 +191,10 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%
 ; CHECK-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
 ; CHECK: # BB#0:
 ; CHECK-NEXT: kmovw %esi, %k1
-; CHECK-NEXT: vmovaps (%rdi), %zmm1
+; CHECK-NEXT: vmovdqa64 (%rdi), %zmm1
 ; CHECK-NEXT: vmovdqa32 {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %x0 = load <16 x float>, <16 x float> *%p0
 %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> , <16 x float> %x0, <16 x float> %x1, i16 %m)
@@ -365,7 +365,7 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) {
 ; CHECK-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3
 ; CHECK-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1}
 ; CHECK-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1}
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %select = bitcast <8 x i64> to <64 x i8>
 %mask = bitcast <16 x i32> to <64 x i8>
@@ -414,7 +414,7 @@ define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x
 ; CHECK: # BB#0:
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> , <8 x i64> %x1, i8 %m)
 ret <8 x i64> %1
@@ -433,7 +433,7 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x
 ; CHECK: # BB#0:
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
-; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> , <8 x double> %x1, i8 %m)
 ret <8 x double> %1
diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll
index a7794afba3d1dfabf1d872812285d7b6f27c6ceb..dde1196803042a6f5e333708ea82b7a2ffc9534b 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-math.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll
@@ -3074,7 +3074,7 @@ define <8 x i16> @trunc_and_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ;
 ; AVX512-LABEL: trunc_and_v8i64_8i16:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vandps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT: retq
 %1 = and <8 x i64> %a0, %a1
@@ -3213,8 +3213,8 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ;
 ; AVX512-LABEL: trunc_and_v16i64_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vandps %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vandps %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -3445,7 +3445,7 @@ define <8 x i16> @trunc_and_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
 ;
 ; AVX512-LABEL: trunc_and_const_v16i64_v16i16:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT: retq
 %1 = and <8 x i64> %a0,
@@ -3587,8 +3587,8 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ;
 ; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vandps {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -3812,7 +3812,7 @@ define <8 x i16> @trunc_xor_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ;
 ; AVX512-LABEL: trunc_xor_v8i64_8i16:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vxorps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT: retq
 %1 = xor <8 x i64> %a0, %a1
@@ -3951,8 +3951,8 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ;
 ; AVX512-LABEL: trunc_xor_v16i64_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vxorps %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vxorps %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -4183,7 +4183,7 @@ define <8 x i16> @trunc_xor_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
 ;
 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i16:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vxorps {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT: retq
 %1 = xor <8 x i64> %a0,
@@ -4325,8 +4325,8 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ;
 ; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512-NEXT: vpxorq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vxorps {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vxorps {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -4550,7 +4550,7 @@ define <8 x i16> @trunc_or_v8i64_8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ;
 ; AVX512-LABEL: trunc_or_v8i64_8i16:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vorps %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT: retq
 %1 = or <8 x i64> %a0, %a1
@@ -4689,8 +4689,8 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ;
 ; AVX512-LABEL: trunc_or_v16i64_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
-; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512-NEXT: vorps %zmm3, %zmm1, %zmm1
+; AVX512-NEXT: vorps %zmm2, %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
@@ -4921,7 +4921,7 @@ define <8 x i16> @trunc_or_const_v16i64_v16i16(<8 x i64> %a0) nounwind {
 ;
 ; AVX512-LABEL: trunc_or_const_v16i64_v16i16:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vorps {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT: retq
 %1 = or <8 x i64> %a0,
@@ -5063,8 +5063,8 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ;
 ; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vporq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512-NEXT: vporq {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT: vorps {{.*}}(%rip), %zmm1, %zmm1
+; AVX512-NEXT: vorps {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT: vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT: vpmovqd %zmm1, %ymm1
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0