From 173c68239d1d11f4e36c8af07a28310da67568a7 Mon Sep 17 00:00:00 2001 From: Fabian Ritter Date: Fri, 11 Oct 2024 08:50:49 +0200 Subject: [PATCH 001/345] [AMDGPU] Enable unaligned scratch accesses (#110219) This allows us to emit wide generic and scratch memory accesses when we do not have alignment information. In cases where accesses happen to be properly aligned or where generic accesses do not go to scratch memory, this improves performance of the generated code by a factor of up to 16x and reduces code size, especially when lowering memcpy and memmove intrinsics. Also: Make the use of the FeatureUnalignedScratchAccess feature more consistent: FeatureUnalignedScratchAccess and EnableFlatScratch are now orthogonal, whereas, before, code assumed that the latter implies the former at some places. Part of SWDEV-455845. --- llvm/lib/Target/AMDGPU/AMDGPU.td | 24 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 16 +- .../CodeGen/AMDGPU/GlobalISel/flat-scratch.ll | 1037 ++- .../AMDGPU/GlobalISel/legalize-load-flat.mir | 3222 +------- .../GlobalISel/legalize-load-private.mir | 5246 +++++++------ llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll | 28 +- .../test/CodeGen/AMDGPU/flat-address-space.ll | 12 +- .../CodeGen/AMDGPU/memcpy-crash-issue63986.ll | 98 +- llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 2438 +----- .../AMDGPU/memcpy-param-combinations.ll | 6516 +++-------------- .../AMDGPU/memmove-param-combinations.ll | 5196 ++----------- llvm/test/CodeGen/AMDGPU/sdwa-commute.ll | 4 +- .../CodeGen/AMDGPU/unaligned-load-store.ll | 28 +- 15 files changed, 6082 insertions(+), 17791 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 25117544d6a8..62fac085897a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1178,9 +1178,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9", FeatureAddNoCarryInsts, 
FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero, - FeatureVmemWriteVgprInOrder + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureNegativeScratchOffsetBug, FeatureGWS, + FeatureDefaultComponentZero, FeatureVmemWriteVgprInOrder ] >; @@ -1199,9 +1199,9 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts, - FeatureGDS, FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength63, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureImageInsts, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength63, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF64GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAtomicFMinFMaxF64FlatInsts, FeatureVmemWriteVgprInOrder @@ -1223,9 +1223,9 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS, - FeatureGWS, FeatureDefaultComponentZero, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureGDS, FeatureGWS, + FeatureDefaultComponentZero, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureVmemWriteVgprInOrder ]
@@ -1246,9 +1246,9 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", FeatureVOP3Literal, FeatureDPP8, FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureA16, FeatureFastDenormalF32, FeatureG16, - FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, - FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast, - FeatureMaxHardClauseLength32, + FeatureUnalignedBufferAccess, FeatureUnalignedScratchAccess, + FeatureUnalignedDSAccess, FeatureTrue16BitInsts, + FeatureDefaultComponentBroadcast, FeatureMaxHardClauseLength32, FeatureAtomicFMinFMaxF32GlobalInsts, FeatureAtomicFMinFMaxF32FlatInsts, FeatureAgentScopeFineGrainedRemoteMemoryAtomics ] diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 3f4f42377d56..d701bf037fdf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -387,8 +387,8 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, // them later if they may access private memory. We don't have enough context // here, and legalization can handle it. 
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) && - ChainSizeInBytes <= ST->getMaxPrivateElementSize(); + return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) && + ChainSizeInBytes <= ST->getMaxPrivateElementSize(); } return true; } diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 194581260931..1ea3beb2855d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -591,6 +591,10 @@ public: return UnalignedScratchAccess; } + bool hasUnalignedScratchAccessEnabled() const { + return UnalignedScratchAccess && UnalignedAccessMode; + } + bool hasUnalignedAccessMode() const { return UnalignedAccessMode; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3d8e03521e2b..8c197f231496 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1824,26 +1824,16 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( Subtarget->hasUnalignedDSAccessEnabled(); } - if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) { - bool AlignedBy4 = Alignment >= Align(4); - if (IsFast) - *IsFast = AlignedBy4; - - return AlignedBy4 || - Subtarget->enableFlatScratch() || - Subtarget->hasUnalignedScratchAccess(); - } - // FIXME: We have to be conservative here and assume that flat operations // will access scratch. If we had access to the IR function, then we // could determine if any private memory was used in the function. 
- if (AddrSpace == AMDGPUAS::FLAT_ADDRESS && - !Subtarget->hasUnalignedScratchAccess()) { + if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS || + AddrSpace == AMDGPUAS::FLAT_ADDRESS) { bool AlignedBy4 = Alignment >= Align(4); if (IsFast) *IsFast = AlignedBy4; - return AlignedBy4; + return AlignedBy4 || Subtarget->hasUnalignedScratchAccessEnabled(); } // So long as they are correct, wide global memory operations perform better diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll index ce528467cd35..6e2e88f22600 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll @@ -2428,11 +2428,54 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v4, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v3, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v4, off +; 
UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v4, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v3, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v4, v8, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -2441,30 +2484,143 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 15 ; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; 
UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 6, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v3, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v3, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], 15 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 15 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v3, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v4, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 6, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; 
UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr3 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v4, v8, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_i64_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, 15 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v2, 0 -; UNALIGNED_GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v3, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 5, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 
v7, 6, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 7, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v3, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v3, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2475,12 +2631,39 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 15 -; 
UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: 
s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:7 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2572,59 +2755,293 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v12, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v6, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt 
vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 2, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; 
UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v10, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v10, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v8, off glc dlc +; 
UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v4, s2 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, s1 -; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v2, s0 -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v12, 3 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, 
v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr6 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, 
off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v3i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX11-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 2, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v10, 3 :: v_dual_add_nc_u32 v5, 1, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
UNALIGNED_GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v8, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 9, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 10, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 11, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v8, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v10, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, 
off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v8, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2635,16 +3052,57 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1 -; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: 
scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: 
s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:11 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: @@ -2742,64 +3200,382 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX9-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX9: ; %bb.0: ; %bb ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX9-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX9-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off -; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v0, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: 
v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX9-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v7, v1, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v8, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v10, v13, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v14, v6, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v16, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; 
UNALIGNED_GFX9-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX9-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_store_byte v6, v3, off +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v0, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v4, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v2, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v5, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v7, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v9, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v8, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v1, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v10, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v12, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v11, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v13, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v14, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v16, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v3, v15, off glc +; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX9-NEXT: ; kill: 
killed $vgpr7 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX9-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX9-NEXT: scratch_load_ubyte v0, v6, off glc ; UNALIGNED_GFX9-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX9-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX10-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX10: ; %bb.0: ; %bb ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX10-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX10-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v4, s3 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, s2 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, s1 -; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, s0 -; UNALIGNED_GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v1, 1 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v2, 2 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v6, 4, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v0, v1, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v5, 2, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v4, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v5, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v1, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt 
null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v6, v2, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v7, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v11, 3 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v9, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v2, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v10, v11, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v12, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX10-NEXT: v_mov_b32_e32 v8, 4 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX10-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v13, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v11, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v14, v8, off ; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc +; UNALIGNED_GFX10-NEXT: scratch_store_byte v15, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v16, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX10-NEXT: scratch_store_byte v17, v3, off +; UNALIGNED_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; 
UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v4, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v5, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v1, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v6, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v7, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v9, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v2, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v10, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v12, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v13, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v11, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v14, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v15, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v16, off glc dlc +; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX10-NEXT: scratch_load_ubyte v0, v17, off glc dlc ; UNALIGNED_GFX10-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX10-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX940-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX940: ; %bb.0: ; %bb ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX940-NEXT: s_mov_b32 s3, 4 -; 
UNALIGNED_GFX940-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX940-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; UNALIGNED_GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; UNALIGNED_GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 1 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v1, 2 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v2, 2, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v4, 1, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v0, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v3, 0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v6, 4 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v7, 4, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v8, 6, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v10, 8, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v11, 10, v0 +; UNALIGNED_GFX940-NEXT: v_mov_b32_e32 v13, 3 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v14, 12, v0 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v15, 14, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v4, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v5, 3, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v2, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v5, v3, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) -; UNALIGNED_GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v9, 5, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v7, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v9, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v1, 7, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v8, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v1, v3, off sc0 sc1 
+; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v12, 9, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v10, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v12, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v13, 11, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v11, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v13, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v16, 13, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v14, v6, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v16, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: v_add_u32_e32 v6, 15, v0 +; UNALIGNED_GFX940-NEXT: scratch_store_byte v15, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_store_byte v6, v3, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v0, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v4, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v2, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v5, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v7, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v9, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v8, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v1, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: 
s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v10, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v12, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v11, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v13, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v14, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v16, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v3, v15, off sc0 sc1 +; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr11 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr4 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr15 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr10 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr7 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr13 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr5 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr14 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr12 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr8 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr2 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr1 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr9 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr16 +; UNALIGNED_GFX940-NEXT: ; kill: killed $vgpr0 +; UNALIGNED_GFX940-NEXT: scratch_load_ubyte v0, v6, off sc0 sc1 ; UNALIGNED_GFX940-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX940-NEXT: s_setpc_b64 s[30:31] ; ; UNALIGNED_GFX11-LABEL: store_load_v4i32_unaligned: ; UNALIGNED_GFX11: ; %bb.0: ; %bb ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; UNALIGNED_GFX11-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX11-NEXT: s_mov_b32 s0, 1 -; 
UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; UNALIGNED_GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_add_nc_u32 v4, 1, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v11, 3 :: v_dual_add_nc_u32 v6, 4, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v0, v1, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v1, 3, v0 +; UNALIGNED_GFX11-NEXT: v_dual_mov_b32 v8, 4 :: v_dual_add_nc_u32 v5, 2, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v7, 5, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v4, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v5, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v1, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v9, 6, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v6, v2, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v7, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v2, 7, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v10, 8, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v12, 9, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v9, v3, off dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; UNALIGNED_GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v13, 10, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v2, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v10, v11, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v12, v3, off dlc +; 
UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v11, 11, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v14, 12, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v15, 13, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v16, 14, v0 +; UNALIGNED_GFX11-NEXT: v_add_nc_u32_e32 v17, 15, v0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v13, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v11, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v14, v8, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v15, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v16, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_store_b8 v17, v3, off dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt_vscnt null, 0x0 +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v4, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v5, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v1, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v6, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v7, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v9, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v2, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v10, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v12, off glc dlc +; 
UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v13, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v11, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v14, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v15, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v16, off glc dlc +; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) +; UNALIGNED_GFX11-NEXT: scratch_load_u8 v0, v17, off glc dlc ; UNALIGNED_GFX11-NEXT: s_waitcnt vmcnt(0) ; UNALIGNED_GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -2810,17 +3586,74 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) { ; UNALIGNED_GFX12-NEXT: s_wait_samplecnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_bvhcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_wait_kmcnt 0x0 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s3, 4 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s2, 3 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s1, 2 -; UNALIGNED_GFX12-NEXT: s_mov_b32 s0, 1 -; UNALIGNED_GFX12-NEXT: s_wait_alu 0xfffe -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2 -; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 +; UNALIGNED_GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 2 +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:2 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, 
off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v1, 3 +; UNALIGNED_GFX12-NEXT: v_mov_b32_e32 v3, 4 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v1, off offset:8 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:9 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 -; UNALIGNED_GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v3, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_store_b8 v0, v2, off offset:15 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_storecnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:1 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:2 scope:SCOPE_SYS +; 
UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:3 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:4 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:5 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:6 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:7 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:8 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:9 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:10 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:11 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:12 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:13 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v1, v0, off offset:14 scope:SCOPE_SYS +; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 +; UNALIGNED_GFX12-NEXT: scratch_load_u8 v0, v0, off offset:15 scope:SCOPE_SYS ; UNALIGNED_GFX12-NEXT: s_wait_loadcnt 0x0 ; UNALIGNED_GFX12-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir index b1d7d36f9912..032ca7c0d4fe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-flat.mir @@ -483,40 
+483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - 
; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -664,40 +646,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -798,70 +762,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; 
GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_s32_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_s32_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s32_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1247,76 +1163,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = 
G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL 
[[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 
2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -1485,130 +1347,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: 
[[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX11PLUS-LABEL: name: test_load_flat_s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 
1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; 
GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p0) :: (load (s64), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; GFX12-LABEL: name: test_load_flat_s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; 
GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD 
[[COPY]](p0) :: (load (s64), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -2075,87 +1829,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; 
GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from 
unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 2) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align2 @@ -2369,165 +2060,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; 
GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = 
G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX11PLUS-LABEL: name: test_load_flat_s96_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], 
[[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = 
G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; GFX12-LABEL: name: test_load_flat_s96_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: 
[[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p0) :: (load (<3 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s96_align1 @@ -3334,210 +2884,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address 
+ 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: 
[[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX11PLUS-LABEL: name: test_load_flat_s128_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) 
- ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; GFX12-LABEL: name: test_load_flat_s128_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], 
[[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x 
s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_s128_align1 @@ -4132,133 +3496,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: 
[[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX11PLUS-LABEL: name: test_load_flat_p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; 
GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; GFX12-LABEL: name: test_load_flat_p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; 
GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p0) :: (load (p1), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p1_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4662,79 +3915,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) 
= G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: 
[[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR2]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -4906,133 +4102,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX11PLUS-LABEL: name: test_load_flat_p4_align1 ; 
GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from 
unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; GFX12-LABEL: name: test_load_flat_p4_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = 
G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: 
[[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR6]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p4) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p4) = G_LOAD [[COPY]](p0) :: (load (p4), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p4) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p4_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5274,43 +4359,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11PLUS-LABEL: name: test_load_flat_p5_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) 
:: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5416,73 +4480,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; 
GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11PLUS-LABEL: name: test_load_flat_p5_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load 
(s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX12-LABEL: name: test_load_flat_p5_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p0) :: (load (p5), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_p5_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -5732,40 +4745,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: 
name: test_load_flat_v2s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v2s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6158,121 +5153,106 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; 
GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; 
GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX9PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXT1]], [[C1]](s32) + ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], 
[[C2]](s32) + ; GFX11PLUS-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX11PLUS-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX11PLUS-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX11PLUS-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX11PLUS-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX11PLUS-NEXT: 
[[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX12-LABEL: name: test_load_flat_v3s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 2) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR 
[[SHL1]], [[OR]] - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXT1]], [[C3]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6503,40 +5483,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -6638,70 +5600,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s8_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX12-LABEL: name: test_load_flat_v4s8_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: 
(load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s32), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s8_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7185,40 +6099,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 2) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -7327,70 +6223,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; 
GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v2s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], 
[[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p0) :: (load (<2 x s16>), align 1) + ; GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -8291,36 +7139,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = 
G_BITCAST [[UV]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX9PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX9PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8334,36 +7168,22 @@ body: | ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) 
from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX11PLUS-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX11PLUS-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX11PLUS-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX11PLUS-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8377,36 +7197,22 @@ body: | ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16), align 1) + ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2, align 1) + ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4, align 1) + ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) + ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) ; GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -8765,70 +7571,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load 
(s16)) - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 
2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) 
:: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C1]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -9005,124 +7763,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s16_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: 
[[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX11PLUS-NEXT: 
[[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; GFX12-LABEL: name: test_load_flat_v4s16_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C3]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = 
G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 6 - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p0) :: (load (<4 x s16>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s16_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -10686,133 +9342,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX9PLUS-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; 
GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align2 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from 
unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY 
[[C3]](s32) - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align2 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s16)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s16) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s16) from unknown-address + 6) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) 
= G_SHL [[ANYEXT]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s16) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p0) :: (load (s16) from unknown-address + 10) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR3]](s32) - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s16) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s16) from unknown-address + 14) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR4]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C3]](s32) - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s64) = G_OR [[SHL5]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR2]](s64), [[OR5]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 2) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align2 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -11100,235 
+9645,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from 
unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: 
[[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT 
[[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: 
[[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v2s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = 
G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from 
unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2s64_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 @@ -12078,342 +10410,42 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], 
[[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: 
[[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; 
GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: 
[[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX9PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX9PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v3s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: 
[[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; 
GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: 
[[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; 
GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX11PLUS-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX11PLUS-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX11PLUS-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v3s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x 
s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - 
; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL 
[[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: 
[[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], 
[[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[PTR_ADD]](p0) :: (load (s64) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[LOAD]](<2 x s64>) ; GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s64>) = G_IMPLICIT_DEF - ; GFX12-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64), [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64), [[OR20]](s64), [[UV3]](s64) + ; GFX12-NEXT: [[UV2:%[0-9]+]]:_(s64), [[UV3:%[0-9]+]]:_(s64), [[UV4:%[0-9]+]]:_(s64), [[UV5:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[DEF]](<4 x s64>) + ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[UV]](s64), [[UV1]](s64), [[LOAD1]](s64), [[UV5]](s64) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BUILD_VECTOR]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v3s64_align1 @@ -13306,441 +11338,33 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = 
G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load 
(s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], 
[[C3]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX9PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX9PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX9PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX9PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX9PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX9PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX9PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX9PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX9PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX9PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX9PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX9PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX9PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX9PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX9PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX9PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX9PLUS-NEXT: 
[[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX9PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX9PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX9PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX9PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX9PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX9PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX9PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX9PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX9PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX9PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX9PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX9PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) 
= G_OR [[SHL23]], [[OR21]] - ; GFX9PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX9PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX9PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX9PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX9PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX9PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX9PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX9PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX9PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX9PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX9PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX9PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from 
unknown-address + 16, align 1) + ; GFX9PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v4s64_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = 
G_ZEXT [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX11PLUS-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: 
[[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX11PLUS-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX11PLUS-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX11PLUS-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX11PLUS-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX11PLUS-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX11PLUS-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX11PLUS-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX11PLUS-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX11PLUS-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX11PLUS-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX11PLUS-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR 
[[SHL16]], [[OR14]] - ; GFX11PLUS-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX11PLUS-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX11PLUS-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX11PLUS-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX11PLUS-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX11PLUS-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX11PLUS-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX11PLUS-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX11PLUS-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX11PLUS-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX11PLUS-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX11PLUS-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX11PLUS-NEXT: 
[[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX11PLUS-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX11PLUS-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX11PLUS-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX11PLUS-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX11PLUS-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX11PLUS-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX11PLUS-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX11PLUS-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX11PLUS-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR24:%[0-9]+]]:_(s32) = G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX11PLUS-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX11PLUS-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from 
unknown-address + 31) - ; GFX11PLUS-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX11PLUS-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX11PLUS-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX11PLUS-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX11PLUS-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX11PLUS-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX11PLUS-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; GFX12-LABEL: name: test_load_flat_v4s64_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p0) :: (load (<2 x s64>), align 1) + ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; 
GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], 
[[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] - ; GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], 
[[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) - ; GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] - ; GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) - ; GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) - ; GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) - ; GFX12-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 - ; GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C7]](s64) - ; GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p0) :: (load (s8) from unknown-address + 16) - ; GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p0) :: (load (s8) from unknown-address + 17) - ; GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[ZEXTLOAD12]] - ; GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], 
[[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p0) :: (load (s8) from unknown-address + 18) - ; GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD17]], [[C]](s64) - ; GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p0) :: (load (s8) from unknown-address + 19) - ; GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD14]] - ; GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[OR15]], [[C3]](s32) - ; GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[OR14]] - ; GFX12-NEXT: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR16]](s32) - ; GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p0) :: (load (s8) from unknown-address + 20) - ; GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p0) :: (load (s8) from unknown-address + 21) - ; GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[ZEXTLOAD15]] - ; GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p0) :: (load (s8) from unknown-address + 22) - ; GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD21]], [[C]](s64) - ; GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p0) :: (load (s8) from unknown-address + 23) - ; GFX12-NEXT: [[SHL18:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX12-NEXT: [[OR18:%[0-9]+]]:_(s32) = G_OR [[SHL18]], [[ZEXTLOAD17]] - ; GFX12-NEXT: [[SHL19:%[0-9]+]]:_(s32) = G_SHL [[OR18]], [[C3]](s32) - ; GFX12-NEXT: [[OR19:%[0-9]+]]:_(s32) = G_OR [[SHL19]], [[OR17]] - ; GFX12-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[OR19]](s32) - ; GFX12-NEXT: 
[[COPY2:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL20:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT2]], [[COPY2]](s32) - ; GFX12-NEXT: [[OR20:%[0-9]+]]:_(s64) = G_OR [[SHL20]], [[ZEXT2]] - ; GFX12-NEXT: [[PTR_ADD23:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD15]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD18:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD23]](p0) :: (load (s8) from unknown-address + 24) - ; GFX12-NEXT: [[PTR_ADD24:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD19:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD24]](p0) :: (load (s8) from unknown-address + 25) - ; GFX12-NEXT: [[SHL21:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD19]], [[C1]](s32) - ; GFX12-NEXT: [[OR21:%[0-9]+]]:_(s32) = G_OR [[SHL21]], [[ZEXTLOAD18]] - ; GFX12-NEXT: [[PTR_ADD25:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD20:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD25]](p0) :: (load (s8) from unknown-address + 26) - ; GFX12-NEXT: [[PTR_ADD26:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD25]], [[C]](s64) - ; GFX12-NEXT: [[LOAD6:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD26]](p0) :: (load (s8) from unknown-address + 27) - ; GFX12-NEXT: [[SHL22:%[0-9]+]]:_(s32) = G_SHL [[LOAD6]], [[C1]](s32) - ; GFX12-NEXT: [[OR22:%[0-9]+]]:_(s32) = G_OR [[SHL22]], [[ZEXTLOAD20]] - ; GFX12-NEXT: [[SHL23:%[0-9]+]]:_(s32) = G_SHL [[OR22]], [[C3]](s32) - ; GFX12-NEXT: [[OR23:%[0-9]+]]:_(s32) = G_OR [[SHL23]], [[OR21]] - ; GFX12-NEXT: [[ZEXT3:%[0-9]+]]:_(s64) = G_ZEXT [[OR23]](s32) - ; GFX12-NEXT: [[PTR_ADD27:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD23]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD21:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD27]](p0) :: (load (s8) from unknown-address + 28) - ; GFX12-NEXT: [[PTR_ADD28:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD22:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD28]](p0) :: (load (s8) from unknown-address + 29) - ; GFX12-NEXT: [[SHL24:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD22]], [[C1]](s32) - ; GFX12-NEXT: [[OR24:%[0-9]+]]:_(s32) = 
G_OR [[SHL24]], [[ZEXTLOAD21]] - ; GFX12-NEXT: [[PTR_ADD29:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD27]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD23:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD29]](p0) :: (load (s8) from unknown-address + 30) - ; GFX12-NEXT: [[PTR_ADD30:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD29]], [[C]](s64) - ; GFX12-NEXT: [[LOAD7:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD30]](p0) :: (load (s8) from unknown-address + 31) - ; GFX12-NEXT: [[SHL25:%[0-9]+]]:_(s32) = G_SHL [[LOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR25:%[0-9]+]]:_(s32) = G_OR [[SHL25]], [[ZEXTLOAD23]] - ; GFX12-NEXT: [[SHL26:%[0-9]+]]:_(s32) = G_SHL [[OR25]], [[C3]](s32) - ; GFX12-NEXT: [[OR26:%[0-9]+]]:_(s32) = G_OR [[SHL26]], [[OR24]] - ; GFX12-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[OR26]](s32) - ; GFX12-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[C5]](s32) - ; GFX12-NEXT: [[SHL27:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT3]], [[COPY3]](s32) - ; GFX12-NEXT: [[OR27:%[0-9]+]]:_(s64) = G_OR [[SHL27]], [[ZEXT3]] - ; GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR20]](s64), [[OR27]](s64) - ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s64>), [[BUILD_VECTOR1]](<2 x s64>) + ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[PTR_ADD]](p0) :: (load (<2 x s64>) from unknown-address + 16, align 1) + ; GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[CONCAT_VECTORS]](<4 x s64>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v4s64_align1 @@ -14762,210 +12386,24 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], 
[[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD 
[[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX9PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX9PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX9PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX9PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX9PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX9PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX9PLUS-NEXT: 
[[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX9PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX9PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX9PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX9PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p1_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: 
(load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX11PLUS-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX11PLUS-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX11PLUS-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX11PLUS-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX11PLUS-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX11PLUS-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX11PLUS-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX11PLUS-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX11PLUS-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX11PLUS-NEXT: 
[[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX11PLUS-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX11PLUS-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX11PLUS-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX11PLUS-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX11PLUS-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX11PLUS-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX11PLUS-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; GFX12-LABEL: name: test_load_flat_v2p1_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from 
unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = 
G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 - ; GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C5]](s64) - ; GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p0) :: (load (s8) from unknown-address + 8) - ; GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p0) :: (load (s8) from unknown-address + 9) - ; GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p0) :: (load (s8) from unknown-address + 10) - ; GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD9]], [[C]](s64) - ; GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p0) :: (load (s8) from unknown-address + 11) - ; GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX12-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 12 - ; GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C6]](s64) - ; GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p0) :: (load (s8) from unknown-address + 12) - ; GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p0) :: (load (s8) from unknown-address + 13) - ; GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], 
[[C1]](s32) - ; GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p0) :: (load (s8) from unknown-address + 14) - ; GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD13]], [[C]](s64) - ; GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p0) :: (load (s8) from unknown-address + 15) - ; GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) - ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p0) :: (load (<4 x s32>), align 1) + ; GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(<2 x p1>) = G_BITCAST [[LOAD]](<4 x s32>) ; GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x p1>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p1_align1 @@ -15422,124 +12860,22 @@ body: | ; GFX9PLUS: liveins: $vgpr0_vgpr1 ; GFX9PLUS-NEXT: {{ $}} ; GFX9PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX9PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX9PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX9PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX9PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX9PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX9PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX9PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9PLUS-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX9PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX9PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX9PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX9PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX9PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX9PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX9PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR 
[[SHL4]], [[ZEXTLOAD5]] - ; GFX9PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX9PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX11PLUS-LABEL: name: test_load_flat_v2p3_align1 ; GFX11PLUS: liveins: $vgpr0_vgpr1 ; GFX11PLUS-NEXT: {{ $}} ; GFX11PLUS-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX11PLUS-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX11PLUS-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX11PLUS-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX11PLUS-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX11PLUS-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX11PLUS-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX11PLUS-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX11PLUS-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX11PLUS-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX11PLUS-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX11PLUS-NEXT: 
[[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX11PLUS-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX11PLUS-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX11PLUS-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX11PLUS-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX11PLUS-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX11PLUS-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX11PLUS-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX11PLUS-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX11PLUS-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX11PLUS-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX11PLUS-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX11PLUS-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX11PLUS-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX11PLUS-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX11PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX11PLUS-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX11PLUS-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; GFX12-LABEL: name: test_load_flat_v2p3_align1 ; GFX12: liveins: $vgpr0_vgpr1 ; 
GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $vgpr0_vgpr1 - ; GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p0) :: (load (s8)) - ; GFX12-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) - ; GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX12-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 - ; GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) - ; GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD1]], [[C]](s64) - ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) - ; GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX12-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C4]](s64) - ; GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p0) :: (load (s8) from unknown-address + 4) - ; GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C]](s64) - ; GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p0) :: (load (s8) from unknown-address + 5) - ; GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = 
G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s64) - ; GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p0) :: (load (s8) from unknown-address + 6) - ; GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD5]], [[C]](s64) - ; GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p0) :: (load (s8) from unknown-address + 7) - ; GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX12-NEXT: [[INTTOPTR1:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR5]](s32) - ; GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x p3>) = G_BUILD_VECTOR [[INTTOPTR]](p3), [[INTTOPTR1]](p3) - ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x p3>) + ; GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p3>) = G_LOAD [[COPY]](p0) :: (load (<2 x p3>), align 1) + ; GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; ; UNALIGNED_GFX9PLUS-LABEL: name: test_load_flat_v2p3_align1 ; UNALIGNED_GFX9PLUS: liveins: $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir index 741f878c86f8..6d93112aae1a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-load-private.mir @@ -636,27 +636,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s16_align1 ; GFX11: liveins: $vgpr0 @@ -702,15 +690,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address 
+ 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s16) = G_LOAD %0 :: (load (s16), align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -853,27 +853,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align2 ; GFX11: liveins: $vgpr0 @@ -919,15 +907,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; 
UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -1012,47 +1012,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX10-LABEL: name: test_load_private_s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR2]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](s32) ; ; GFX11-LABEL: name: test_load_private_s32_align1 ; GFX11: liveins: $vgpr0 @@ -1118,15 +1086,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR2]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR2]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s32) = G_LOAD %0 :: (load (s32), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -1529,39 +1529,27 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: 
[[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX9-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX10-LABEL: name: test_load_private_s24_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR1]](s32) + ; GFX10-NEXT: $vgpr0 = COPY [[OR]](s32) ; ; GFX11-LABEL: name: test_load_private_s24_align1 ; GFX11: liveins: $vgpr0 @@ -1631,27 +1619,39 @@ body: 
| ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR1]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s24_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY 
$vgpr0 - ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR1]](s32) %0:_(p5) = COPY $vgpr0 %1:_(s24) = G_LOAD %0 :: (load (s24), align 1, addrspace 5) %2:_(s32) = G_ANYEXT %1 @@ -2147,42 +2147,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: 
(load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align2 @@ -2245,15 +2225,51 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s64_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR2]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load (s64), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2386,78 +2402,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load 
(s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX10-LABEL: name: test_load_private_s64_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, 
addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 
(s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64) ; ; GFX11-LABEL: name: test_load_private_s64_align1 @@ -2556,15 +2516,87 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], 
[[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) ; ; UNALIGNED_GFX12-LABEL: name: 
test_load_private_s64_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p5) :: (load (s64), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](s64) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[OR6]](s64) %0:_(p5) = COPY $vgpr0 %1:_(s64) = G_LOAD %0 :: (load (s64), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -2742,53 +2774,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; 
GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2796,53 +2789,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], 
[[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: 
[[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -2974,16 +2928,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -3381,28 +3427,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, 
addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3410,28 +3442,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3513,16 +3531,58 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: 
test_load_private_s96_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 2, addrspace 5) @@ -3701,53 +3761,14 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: 
[[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; 
GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3755,53 +3776,14 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: 
[[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; @@ -3933,16 +3915,108 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s96_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR 
[[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR 
[[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BITCAST]](s96) %0:_(p5) = COPY $vgpr0 %1:_(s96) = G_LOAD %0 :: (load (s96), align 1, addrspace 5) @@ -4166,68 +4240,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: 
[[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: 
[[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4235,68 +4258,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; 
GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4458,16 +4430,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 
5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL 
[[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: 
$vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: 
[[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -4928,35 +5022,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = 
G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -4964,35 +5040,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) 
:: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x 
s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5088,16 +5146,72 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from 
unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: 
[[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 2, addrspace 5) @@ -5321,68 +5435,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) 
:: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], 
[[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5390,68 +5453,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load 
(s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; @@ -5613,16 +5625,138 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: 
[[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) 
+ ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR 
[[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_s128_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR 
[[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s128) = G_BITCAST [[BUILD_VECTOR]](<4 x s32>) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](s128) %0:_(p5) = COPY $vgpr0 %1:_(s128) = G_LOAD %0 :: (load (s128), align 1, addrspace 5) @@ -5932,42 +6066,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from 
unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 
6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align2 @@ -6030,15 +6144,53 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[SHL2]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR2]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6171,78 +6323,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from 
unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) 
from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX10-LABEL: name: test_load_private_p1_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: 
[[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[MV]](p1) ; ; GFX11-LABEL: name: test_load_private_p1_align1 @@ -6341,15 +6437,89 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p1_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[COPY]](p5) :: (load (p1), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](p1) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; 
UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p1) = G_INTTOPTR [[OR6]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[INTTOPTR]](p1) %0:_(p5) = COPY $vgpr0 %1:_(p1) = G_LOAD %0 :: (load (p1), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -6494,29 +6664,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: 
[[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align2 ; GFX11: liveins: $vgpr0 @@ -6564,15 +6720,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -6660,49 +6830,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, 
addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX10-LABEL: name: test_load_private_p3_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 
2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p3) ; ; GFX11-LABEL: name: test_load_private_p3_align1 ; GFX11: liveins: $vgpr0 @@ -6770,15 +6906,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], 
[[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p3_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p3) = G_LOAD [[COPY]](p5) :: (load (p3), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p3) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], 
[[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p3) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p3) %0:_(p5) = COPY $vgpr0 %1:_(p3) = G_LOAD %0 :: (load (p3), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -6923,29 +7093,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, 
addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align2 ; GFX11: liveins: $vgpr0 @@ -6993,15 +7149,29 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], 
[[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -7089,49 +7259,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR 
[[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX9-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX10-LABEL: name: test_load_private_p5_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from 
unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) - ; GFX10-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](p5) ; ; GFX11-LABEL: name: test_load_private_p5_align1 ; GFX11: liveins: $vgpr0 @@ -7199,15 +7335,49 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_p5_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(p5) = G_LOAD [[COPY]](p5) :: (load (p5), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](p5) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[INTTOPTR:%[0-9]+]]:_(p5) = G_INTTOPTR [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[INTTOPTR]](p5) %0:_(p5) = COPY $vgpr0 %1:_(p5) = G_LOAD %0 :: (load (p5), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -7357,30 +7527,20 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR 
[[LOAD]], [[C]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s8_align1 @@ -7437,20 +7597,30 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), 
[[LSHR]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[LOAD]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LSHR]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s8>) = G_LOAD %0 :: (load (<2 x s8>), align 1, addrspace 5) @@ -7938,81 +8108,71 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX9-NEXT: 
[[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX9-NEXT: $vgpr0 = COPY [[OR4]](s32) + 
; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX9-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX9-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX9-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX10-LABEL: name: test_load_private_v3s8_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) - ; 
GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) - ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; GFX10-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] + ; GFX10-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] + ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) + ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; GFX10-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] - ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) - ; 
GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] - ; GFX10-NEXT: $vgpr0 = COPY [[OR4]](s32) + ; GFX10-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] + ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) + ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] + ; GFX10-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; GFX10-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) + ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] + ; GFX10-NEXT: $vgpr0 = COPY [[OR3]](s32) ; ; GFX11-LABEL: name: test_load_private_v3s8_align1 ; GFX11: liveins: $vgpr0 @@ -8168,71 +8328,81 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - 
; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX11-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX11-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; 
UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX11-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL3]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[OR4]](s32) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s8_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) 
:: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C2]](s32) - ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[OR1]], [[C3]](s32) ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 - ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C3]] + ; 
UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; UNALIGNED_GFX12-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 - ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL1]] + ; UNALIGNED_GFX12-NEXT: [[AND1:%[0-9]+]]:_(s16) = G_AND [[TRUNC1]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s16) = G_CONSTANT i16 8 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND1]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND]], [[SHL2]] ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C3]] + ; UNALIGNED_GFX12-NEXT: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[DEF]](s32) - ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C3]] - ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C4]](s16) - ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[SHL2]] - ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) - ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) - ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL3]] - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[AND3:%[0-9]+]]:_(s16) = G_AND [[TRUNC3]], [[C4]] + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s16) = G_SHL [[AND3]], [[C5]](s16) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s16) = G_OR [[AND2]], 
[[SHL3]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR2]](s16) + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR3]](s16) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL4]] + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[OR4]](s32) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s8>) = G_LOAD %0 :: (load (<3 x s8>), align 1, addrspace 5) %2:_(s24) = G_BITCAST %1 @@ -8658,136 +8828,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = 
G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: 
[[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: 
[[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v16s8_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: 
[[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from 
unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v16s8_align16 @@ -8944,15 +9012,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], 
[[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: 
[[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v16s8_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: 
[[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load 
(s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: 
(load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<16 x s8>) = G_LOAD %0 :: (load (<16 x s8>), align 1, addrspace 5) %2:_(<4 x s32>) = G_BITCAST %1 @@ -9107,27 +9297,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align2 ; GFX11: liveins: $vgpr0 @@ -9173,15 +9351,27 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 2, addrspace 5) $vgpr0 = COPY %1 @@ -9278,47 +9468,15 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: 
[[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX10-LABEL: name: test_load_private_v2s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = 
G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) ; ; GFX11-LABEL: name: test_load_private_v2s16_align1 ; GFX11: liveins: $vgpr0 @@ -9384,15 +9542,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[LOAD]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s16>) = G_LOAD %0 :: (load (<2 x s16>), align 1, addrspace 5) $vgpr0 = COPY %1 @@ -9824,27 +10014,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = 
G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -9853,27 +10042,26 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10215,41 +10403,26 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; 
GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX9-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX9-NEXT: [[UV:%[0-9]+]]:_(<2 
x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX9-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX9-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX9-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC3]](s16) ; GFX9-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10258,41 +10431,26 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; GFX10-NEXT: 
[[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[LOAD]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) - ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) - ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) - ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) - ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) + ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C1]](s32) + ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32) + ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) + ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC1]](s16), [[TRUNC2]](s16) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR 
[[TRUNC]](s16), [[TRUNC3]](s16) ; GFX10-NEXT: [[BUILD_VECTOR2:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC4]](s16), [[TRUNC5]](s16) ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<6 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>), [[BUILD_VECTOR2]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[CONCAT_VECTORS]](<6 x s16>) @@ -10445,22 +10603,36 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: 
[[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX11-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX11-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX11-NEXT: 
[[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX11-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX11-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10474,22 +10646,36 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) ; UNALIGNED_GFX12-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF ; UNALIGNED_GFX12-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>) ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32) - ; 
UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C4]](s32) ; UNALIGNED_GFX12-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32) ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>) ; UNALIGNED_GFX12-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32) @@ -10827,44 +11013,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) - ; GFX9-NEXT: 
[[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) 
= G_TRUNC [[LOAD3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align2 @@ -10929,15 +11093,47 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11091,80 +11287,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) 
= G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX10-LABEL: name: test_load_private_v4s16_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p5) :: (load (<2 x s16>), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) - ; GFX10-NEXT: 
[[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) - ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[PTR_ADD]](p5) :: (load (<2 x s16>) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[LOAD]](<2 x s16>), [[LOAD1]](<2 x s16>) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; GFX11-LABEL: name: test_load_private_v4s16_align1 @@ -11265,15 
+11403,83 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load (<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX11-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s16_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[COPY]](p5) :: (load 
(<4 x s16>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<4 x s16>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[OR]](s32) + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load 
(s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC]](s16), [[TRUNC1]](s16) + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[TRUNC2]](s16), [[TRUNC3]](s16) + ; UNALIGNED_GFX12-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s16>) = G_LOAD %0 :: (load (<4 x s16>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11582,42 +11788,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align2 @@ -11680,15 +11866,43 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: 
(load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 2, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -11821,78 +12035,22 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; 
GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], 
[[OR3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX10-LABEL: name: test_load_private_v2s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: 
[[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; GFX11-LABEL: name: test_load_private_v2s32_align1 @@ -11991,15 +12149,79 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x 
s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s32_align1 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[COPY]](p5) :: (load (<2 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s32>) = G_LOAD %0 :: (load (<2 x s32>), align 1, addrspace 5) $vgpr0_vgpr1 = COPY %1 @@ -12174,106 +12396,28 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: 
[[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; GFX10-LABEL: name: test_load_private_v3s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: 
[[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; 
GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; GFX11-LABEL: name: test_load_private_v3s32_align16 @@ -12400,15 +12544,107 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: 
[[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v3s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[LOAD]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<3 x s32>) = G_LOAD %0 :: (load (<3 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2 = COPY %1 @@ -12764,136 +13000,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: 
[[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: 
[[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align16 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: 
[[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], 
[[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR 
[[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align16 @@ -13050,15 +13184,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: 
[[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY 
[[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 
4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], 
[[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13493,70 +13749,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) 
= G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, 
addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align2 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD 
[[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align2 @@ -13647,15 +13867,71 @@ body: | ; UNALIGNED_GFX11: 
liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align2 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32), [[OR3]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 2, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -13875,136 +14151,34 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; 
GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, 
addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], 
[[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX10-LABEL: name: test_load_private_v4s32_align1 ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; 
GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: 
[[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], 
[[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; GFX11-LABEL: name: test_load_private_v4s32_align1 @@ -14161,15 +14335,137 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v4s32_align1 ; 
UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[COPY]](p5) :: (load (<4 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<4 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) 
+ ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; 
UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32), [[OR11]](s32) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<4 x s32>) %0:_(p5) = COPY $vgpr0 %1:_(<4 x s32>) = G_LOAD %0 :: (load (<4 x s32>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -15262,68 +15558,17 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, 
addrspace 5) ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: 
[[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - 
; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15331,68 +15576,17 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) + ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD]](s32), [[LOAD1]](s32) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: 
[[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL 
[[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR2]](s32), [[OR5]](s32) - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: 
[[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[OR8]](s32), [[OR11]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[LOAD2]](s32), [[LOAD3]](s32) ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[MV]](s64), [[MV1]](s64) ; GFX10-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; @@ -15552,15 +15746,155 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD 
[[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) 
= G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX11-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX11-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX11-NEXT: 
[[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) ; ; UNALIGNED_GFX12-LABEL: name: test_load_private_v2s64_align16 ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[COPY]](p5) :: (load (<2 x s64>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[LOAD]](<2 x s64>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], 
[[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; 
UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s64) = G_OR [[SHL6]], [[ZEXT]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[OR8]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[OR7]] + ; UNALIGNED_GFX12-NEXT: [[ZEXT1:%[0-9]+]]:_(s64) = G_ZEXT [[OR9]](s32) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[OR11]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[OR10]] + ; UNALIGNED_GFX12-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[OR12]](s32) + ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT1]], [[COPY1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s64) = G_OR [[SHL13]], [[ZEXT1]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[OR6]](s64), [[OR13]](s64) + ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>) %0:_(p5) = COPY $vgpr0 %1:_(<2 x s64>) = G_LOAD %0 :: (load (<2 x s64>), align 1, addrspace 5) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 @@ -18178,98 +18512,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD 
[[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], 
[[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX9-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX9-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX9-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX9-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX9-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR 
[[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX9-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX9-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX9-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX9-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX9-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX9-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX9-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX9-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: 
[[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) - ; GFX9-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX9-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX9-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX9-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX9-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX9-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX9-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX9-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX9-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX9-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX9-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX9-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from 
unknown-address + 23, addrspace 5) - ; GFX9-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] - ; GFX9-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX9-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18280,98 +18539,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 1, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = 
G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 1, addrspace 5) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from 
unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) - ; GFX10-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) - ; GFX10-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, addrspace 5) - ; GFX10-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] - ; GFX10-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) - ; GFX10-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) - ; GFX10-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) - ; GFX10-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) - ; GFX10-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] - ; GFX10-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) - ; GFX10-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] - ; GFX10-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) - ; GFX10-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR [[SHL11]], [[OR9]] - ; GFX10-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from 
unknown-address + 17, addrspace 5) - ; GFX10-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) - ; GFX10-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] - ; GFX10-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) - ; GFX10-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] - ; GFX10-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) - ; GFX10-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] - ; GFX10-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) - ; GFX10-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) - ; GFX10-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) - ; GFX10-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) - ; GFX10-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] - ; GFX10-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) - ; GFX10-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR 
[[SHL16]], [[ZEXTLOAD17]] - ; GFX10-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) - ; GFX10-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 1, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 1, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18616,12 +18800,99 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; 
UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX11-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX11-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR 
[[SHL11]], [[OR9]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX11-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; 
UNALIGNED_GFX11-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX11-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18631,12 +18902,99 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s8), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; 
UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 1, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p5) :: (load (s8) from unknown-address + 1, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s8) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s8) from unknown-address + 3, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s8) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD4]](p5) :: (load (s8) from unknown-address + 5, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s8) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s8) from unknown-address + 7, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[OR4]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[OR3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD6:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s8) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD7:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD8]](p5) :: (load (s8) from unknown-address + 9, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL6:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD7]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR6:%[0-9]+]]:_(s32) = G_OR [[SHL6]], [[ZEXTLOAD6]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD8:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s8) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s8) from unknown-address + 11, 
addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL7:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR7:%[0-9]+]]:_(s32) = G_OR [[SHL7]], [[ZEXTLOAD8]] + ; UNALIGNED_GFX12-NEXT: [[SHL8:%[0-9]+]]:_(s32) = G_SHL [[OR7]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR8:%[0-9]+]]:_(s32) = G_OR [[SHL8]], [[OR6]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR2]](s32), [[OR5]](s32), [[OR8]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD11:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C5]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD9:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD11]](p5) :: (load (s8) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD12:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD10:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD12]](p5) :: (load (s8) from unknown-address + 13, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL9:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD10]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR9:%[0-9]+]]:_(s32) = G_OR [[SHL9]], [[ZEXTLOAD9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD13:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD11:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD13]](p5) :: (load (s8) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD14:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD13]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD14]](p5) :: (load (s8) from unknown-address + 15, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL10:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR10:%[0-9]+]]:_(s32) = G_OR [[SHL10]], [[ZEXTLOAD11]] + ; UNALIGNED_GFX12-NEXT: [[SHL11:%[0-9]+]]:_(s32) = G_SHL [[OR10]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR11:%[0-9]+]]:_(s32) = G_OR 
[[SHL11]], [[OR9]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD15:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD12:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD15]](p5) :: (load (s8) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD16:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD13:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD16]](p5) :: (load (s8) from unknown-address + 17, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL12:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD13]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR12:%[0-9]+]]:_(s32) = G_OR [[SHL12]], [[ZEXTLOAD12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD17:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD15]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD14:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD17]](p5) :: (load (s8) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD18:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD17]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD18]](p5) :: (load (s8) from unknown-address + 19, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL13:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR13:%[0-9]+]]:_(s32) = G_OR [[SHL13]], [[ZEXTLOAD14]] + ; UNALIGNED_GFX12-NEXT: [[SHL14:%[0-9]+]]:_(s32) = G_SHL [[OR13]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR14:%[0-9]+]]:_(s32) = G_OR [[SHL14]], [[OR12]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD19:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD11]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD15:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD19]](p5) :: (load (s8) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD20:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD16:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD20]](p5) :: (load (s8) from unknown-address + 21, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL15:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD16]], [[C1]](s32) + ; 
UNALIGNED_GFX12-NEXT: [[OR15:%[0-9]+]]:_(s32) = G_OR [[SHL15]], [[ZEXTLOAD15]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD21:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD19]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD17:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD21]](p5) :: (load (s8) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD22:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD21]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD22]](p5) :: (load (s8) from unknown-address + 23, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL16:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR16:%[0-9]+]]:_(s32) = G_OR [[SHL16]], [[ZEXTLOAD17]] + ; UNALIGNED_GFX12-NEXT: [[SHL17:%[0-9]+]]:_(s32) = G_SHL [[OR16]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR17:%[0-9]+]]:_(s32) = G_OR [[SHL17]], [[OR15]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR11]](s32), [[OR14]](s32), [[OR17]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -18818,49 +19176,23 @@ body: | ; GFX9: liveins: $vgpr0 ; GFX9-NEXT: {{ $}} ; GFX9-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX9-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX9-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX9-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX9-NEXT: 
[[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX9-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX9-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX9-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX9-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX9-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX9-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX9-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX9-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX9-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST 
[[BUILD_VECTOR]](<3 x s32>) - ; GFX9-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX9-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX9-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX9-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX9-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX9-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX9-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX9-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX9-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX9-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX9-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX9-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX9-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX9-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX9-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX9-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX9-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX9-NEXT: 
[[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX9-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX9-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; GFX9-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX9-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX9-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -18871,49 +19203,23 @@ body: | ; GFX10: liveins: $vgpr0 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; GFX10-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) - ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[COPY]](p5) :: (load (s32), align 2, addrspace 5) + ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) - ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] - ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 - ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = 
G_PTR_ADD [[PTR_ADD1]], [[C]](s32) - ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) - ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) - ; GFX10-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] - ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) - ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) - ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) - ; GFX10-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] - ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; GFX10-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s32) from unknown-address + 4, align 2, addrspace 5) + ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C1]](s32) + ; GFX10-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p5) :: (load (s32) from unknown-address + 8, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32) ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) - ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 - ; GFX10-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) - ; GFX10-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) - ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD 
[[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) - ; GFX10-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) - ; GFX10-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] - ; GFX10-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) - ; GFX10-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) - ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) - ; GFX10-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) - ; GFX10-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] - ; GFX10-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) - ; GFX10-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) - ; GFX10-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) - ; GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) - ; GFX10-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) - ; GFX10-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] - ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; GFX10-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s32) from unknown-address + 12, align 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C]](s32) + ; GFX10-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p5) :: (load (s32) from unknown-address + 16, align 2, addrspace 5) + ; GFX10-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD2]], [[C1]](s32) + ; 
GFX10-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s32) from unknown-address + 20, align 2, addrspace 5) + ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[LOAD3]](s32), [[LOAD4]](s32), [[LOAD5]](s32) ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) @@ -19060,12 +19366,50 @@ body: | ; UNALIGNED_GFX11: liveins: $vgpr0 ; UNALIGNED_GFX11-NEXT: {{ $}} ; UNALIGNED_GFX11-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX11-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX11-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX11-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from 
unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX11-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX11-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; 
UNALIGNED_GFX11-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX11-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load (s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX11-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX11-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX11-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX11-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX11-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX11-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX11-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX11-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) @@ -19075,12 +19419,50 @@ body: | ; UNALIGNED_GFX12: liveins: $vgpr0 ; UNALIGNED_GFX12-NEXT: {{ $}} ; UNALIGNED_GFX12-NEXT: [[COPY:%[0-9]+]]:_(p5) = COPY $vgpr0 - ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[COPY]](p5) :: (load (<3 x s32>), align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = 
G_BITCAST [[LOAD]](<3 x s32>) - ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[COPY]](p5) :: (load (s16), addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; UNALIGNED_GFX12-NEXT: [[PTR_ADD:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C]](s32) - ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[PTR_ADD]](p5) :: (load (<3 x s32>) from unknown-address + 12, align 2, addrspace 5) - ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[LOAD1]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p5) :: (load (s16) from unknown-address + 2, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; UNALIGNED_GFX12-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; UNALIGNED_GFX12-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p5) :: (load (s16) from unknown-address + 4, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD1]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p5) :: (load (s16) from unknown-address + 6, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD1]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD1]] + ; UNALIGNED_GFX12-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD3:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD3]](p5) :: (load (s16) from unknown-address + 8, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD4:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD3]], 
[[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD4]](p5) :: (load (s16) from unknown-address + 10, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[LOAD2]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[ZEXTLOAD2]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR]](s32), [[OR1]](s32), [[OR2]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR]](<3 x s32>) + ; UNALIGNED_GFX12-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 12 + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD5:%[0-9]+]]:_(p5) = G_PTR_ADD [[COPY]], [[C4]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD3:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD5]](p5) :: (load (s16) from unknown-address + 12, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD6:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD6]](p5) :: (load (s16) from unknown-address + 14, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL3:%[0-9]+]]:_(s32) = G_SHL [[LOAD3]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR3:%[0-9]+]]:_(s32) = G_OR [[SHL3]], [[ZEXTLOAD3]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD7:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C2]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD4:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD7]](p5) :: (load (s16) from unknown-address + 16, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD8:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD7]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD4:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD8]](p5) :: (load (s16) from unknown-address + 18, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL4:%[0-9]+]]:_(s32) = G_SHL [[LOAD4]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR4:%[0-9]+]]:_(s32) = G_OR [[SHL4]], [[ZEXTLOAD4]] + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD9:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD5]], [[C3]](s32) + ; UNALIGNED_GFX12-NEXT: [[ZEXTLOAD5:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD9]](p5) :: (load 
(s16) from unknown-address + 20, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[PTR_ADD10:%[0-9]+]]:_(p5) = G_PTR_ADD [[PTR_ADD9]], [[C]](s32) + ; UNALIGNED_GFX12-NEXT: [[LOAD5:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD10]](p5) :: (load (s16) from unknown-address + 22, addrspace 5) + ; UNALIGNED_GFX12-NEXT: [[SHL5:%[0-9]+]]:_(s32) = G_SHL [[LOAD5]], [[C1]](s32) + ; UNALIGNED_GFX12-NEXT: [[OR5:%[0-9]+]]:_(s32) = G_OR [[SHL5]], [[ZEXTLOAD5]] + ; UNALIGNED_GFX12-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[OR3]](s32), [[OR4]](s32), [[OR5]](s32) + ; UNALIGNED_GFX12-NEXT: [[BITCAST1:%[0-9]+]]:_(s96) = G_BITCAST [[BUILD_VECTOR1]](<3 x s32>) ; UNALIGNED_GFX12-NEXT: [[COPY1:%[0-9]+]]:_(s96) = COPY [[BITCAST]](s96) ; UNALIGNED_GFX12-NEXT: [[COPY2:%[0-9]+]]:_(s96) = COPY [[BITCAST1]](s96) ; UNALIGNED_GFX12-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[COPY1]](s96) diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index ea10547da6ab..3fc5d0d4b279 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -475,8 +475,14 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) -; FLATSCR-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 +; FLATSCR-NEXT: s_mov_b32 s0, 0x5040100 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_perm_b32 v0, v0, v3, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR-NEXT: s_endpgm @@ -537,8 +543,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) 
nocapture readonly %i ; FLATSCR_GFX10-NEXT: scratch_store_short off, v0, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: s_clause 0x1 -; FLATSCR_GFX10-NEXT: scratch_load_dword v0, off, s0 -; FLATSCR_GFX10-NEXT: scratch_load_dword v1, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 offset:2 +; FLATSCR_GFX10-NEXT: scratch_load_ushort v3, off, s0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(1) +; FLATSCR_GFX10-NEXT: v_mov_b32_e32 v1, v0 +; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) +; FLATSCR_GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v1, off, s0 offset:4 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] ; FLATSCR_GFX10-NEXT: s_endpgm @@ -561,8 +572,13 @@ define amdgpu_kernel void @vload2_private(ptr addrspace(1) nocapture readonly %i ; GFX11-NEXT: scratch_store_b16 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v0, off, off -; GFX11-NEXT: scratch_load_b32 v1, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v0, off, off offset:2 +; GFX11-NEXT: scratch_load_u16 v3, off, off +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x5040100 +; GFX11-NEXT: scratch_load_d16_hi_b16 v1, off, off offset:4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX11-NEXT: s_nop 0 diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 0ad53083d0ff..12593e3760fd 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -123,10 +123,8 @@ define amdgpu_kernel void @zextload_flat_i16(ptr addrspace(1) noalias %out, ptr } ; GCN-LABEL: flat_scratch_unaligned_load: -; GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} -; 
GCN: flat_load_{{ubyte|u8}} -; GCN: flat_load_{{ubyte|u8}} +; GFX9: flat_load_dword +; GFX10PLUS: flat_load_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr @@ -136,10 +134,8 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() { } ; GCN-LABEL: flat_scratch_unaligned_store: -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} -; GCN: flat_store_{{byte|b8}} +; GFX9: flat_store_dword +; GFX10PLUS: flat_store_{{dword|b32}} define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32, addrspace(5) %fptr = addrspacecast ptr addrspace(5) %scratch to ptr diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll index 1dd18b4228fe..9d43efbdf07b 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll @@ -16,47 +16,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: v_mov_b32_e32 v9, s7 ; CHECK-NEXT: v_mov_b32_e32 v8, s6 -; CHECK-NEXT: flat_load_ubyte v10, v[8:9] offset:5 -; CHECK-NEXT: flat_load_ubyte v11, v[8:9] offset:6 -; CHECK-NEXT: flat_load_ubyte v12, v[8:9] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[8:9] offset:3 -; CHECK-NEXT: flat_load_ubyte v14, v[8:9] offset:2 -; CHECK-NEXT: flat_load_ubyte v15, v[8:9] offset:1 -; CHECK-NEXT: flat_load_ubyte v16, v[8:9] -; CHECK-NEXT: flat_load_ubyte v17, v[8:9] offset:4 -; CHECK-NEXT: flat_load_ubyte v18, v[8:9] offset:13 -; CHECK-NEXT: flat_load_ubyte v19, v[8:9] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[8:9] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[8:9] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[8:9] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[8:9] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[8:9] offset:8 -; CHECK-NEXT: 
flat_load_ubyte v25, v[8:9] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; CHECK-NEXT: s_add_u32 s4, s4, 1 ; CHECK-NEXT: s_addc_u32 s5, s5, 0 -; CHECK-NEXT: v_add_co_u32_e32 v8, vcc, s6, v6 +; CHECK-NEXT: v_mov_b32_e32 v13, s7 +; CHECK-NEXT: v_add_co_u32_e32 v12, vcc, s6, v6 ; CHECK-NEXT: v_cmp_ge_u64_e64 s[8:9], s[4:5], 2 -; CHECK-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v7, vcc +; CHECK-NEXT: v_addc_co_u32_e32 v13, vcc, v13, v7, vcc ; CHECK-NEXT: s_add_u32 s6, s6, 16 ; CHECK-NEXT: s_addc_u32 s7, s7, 0 ; CHECK-NEXT: s_and_b64 vcc, exec, s[8:9] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[8:9], v13 offset:3 -; CHECK-NEXT: flat_store_byte v[8:9], v14 offset:2 -; CHECK-NEXT: flat_store_byte v[8:9], v15 offset:1 -; CHECK-NEXT: flat_store_byte v[8:9], v16 -; CHECK-NEXT: flat_store_byte v[8:9], v12 offset:7 -; CHECK-NEXT: flat_store_byte v[8:9], v11 offset:6 -; CHECK-NEXT: flat_store_byte v[8:9], v10 offset:5 -; CHECK-NEXT: flat_store_byte v[8:9], v17 offset:4 -; CHECK-NEXT: flat_store_byte v[8:9], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[8:9], v22 offset:10 -; CHECK-NEXT: flat_store_byte v[8:9], v23 offset:9 -; CHECK-NEXT: flat_store_byte v[8:9], v24 offset:8 -; CHECK-NEXT: flat_store_byte v[8:9], v20 offset:15 -; CHECK-NEXT: flat_store_byte v[8:9], v19 offset:14 -; CHECK-NEXT: flat_store_byte v[8:9], v18 offset:13 -; CHECK-NEXT: flat_store_byte v[8:9], v25 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; CHECK-NEXT: s_cbranch_vccz .LBB0_2 ; CHECK-NEXT: ; %bb.3: ; %loop-memcpy-residual-header ; CHECK-NEXT: s_mov_b32 s4, 0 @@ -128,47 +99,18 @@ define void @issue63986(i64 %0, i64 %idxprom) { ; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 ; CHECK-NEXT: v_mov_b32_e32 v10, s10 ; CHECK-NEXT: v_mov_b32_e32 v11, s11 -; CHECK-NEXT: flat_load_ubyte v12, v[10:11] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[10:11] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[10:11] offset:7 -; CHECK-NEXT: flat_load_ubyte v15, 
v[10:11] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[10:11] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[10:11] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[10:11] -; CHECK-NEXT: flat_load_ubyte v19, v[10:11] offset:4 -; CHECK-NEXT: flat_load_ubyte v20, v[10:11] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[10:11] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[10:11] offset:15 -; CHECK-NEXT: flat_load_ubyte v23, v[10:11] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[10:11] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[10:11] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[10:11] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[10:11] offset:12 +; CHECK-NEXT: flat_load_dwordx4 v[10:13], v[10:11] +; CHECK-NEXT: v_mov_b32_e32 v15, s11 ; CHECK-NEXT: s_add_u32 s14, s14, 1 -; CHECK-NEXT: v_add_co_u32_e32 v10, vcc, s10, v2 -; CHECK-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v3, vcc +; CHECK-NEXT: v_add_co_u32_e32 v14, vcc, s10, v2 +; CHECK-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v3, vcc ; CHECK-NEXT: s_addc_u32 s15, s15, 0 ; CHECK-NEXT: s_add_u32 s10, s10, 16 ; CHECK-NEXT: v_cmp_ge_u64_e32 vcc, s[14:15], v[4:5] ; CHECK-NEXT: s_addc_u32 s11, s11, 0 ; CHECK-NEXT: s_or_b64 s[12:13], vcc, s[12:13] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[10:11], v15 offset:3 -; CHECK-NEXT: flat_store_byte v[10:11], v16 offset:2 -; CHECK-NEXT: flat_store_byte v[10:11], v17 offset:1 -; CHECK-NEXT: flat_store_byte v[10:11], v18 -; CHECK-NEXT: flat_store_byte v[10:11], v14 offset:7 -; CHECK-NEXT: flat_store_byte v[10:11], v13 offset:6 -; CHECK-NEXT: flat_store_byte v[10:11], v12 offset:5 -; CHECK-NEXT: flat_store_byte v[10:11], v19 offset:4 -; CHECK-NEXT: flat_store_byte v[10:11], v23 offset:11 -; CHECK-NEXT: flat_store_byte v[10:11], v24 offset:10 -; CHECK-NEXT: flat_store_byte v[10:11], v25 offset:9 -; CHECK-NEXT: flat_store_byte v[10:11], v26 offset:8 -; CHECK-NEXT: flat_store_byte v[10:11], v22 offset:15 -; CHECK-NEXT: flat_store_byte v[10:11], v21 
offset:14 -; CHECK-NEXT: flat_store_byte v[10:11], v20 offset:13 -; CHECK-NEXT: flat_store_byte v[10:11], v27 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[14:15], v[10:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[12:13] ; CHECK-NEXT: s_cbranch_execnz .LBB0_14 ; CHECK-NEXT: .LBB0_15: ; %Flow20 @@ -251,23 +193,11 @@ define void @issue63986_reduced_expanded(i64 %idxprom) { ; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB1_8: ; %post-loop-memcpy-expansion ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v3, v2 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v5, v2 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v2 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:12 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: .LBB1_9: ; %loop-memcpy-expansion2 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB1_9 diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll index 0a76e169e9c3..8c28fac0d839 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll @@ -10,108 +10,21 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 ; CHECK: ; %bb.0: ; %entry ; 
CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: 
flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: 
flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: 
flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -185,375 +98,59 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add ; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; 
CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen 
offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte 
v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, 
v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: 
buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen 
offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, 
s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; 
CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: 
global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; 
CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; 
CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -569,363 +166,57 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, 
s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; 
CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 
0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; 
CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; 
CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 
0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; 
CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; 
CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: 
flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: 
flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -972,279 +263,27 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 { ; CHECK-LABEL: memcpy_p0_p3_minsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 
offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v10 
offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:95 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 -; CHECK-NEXT: 
flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:39 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 
offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; 
CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) @@ -1256,108 +295,21 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] -; CHECK-NEXT: flat_load_ubyte 
v5, v[0:1] offset:1 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14 -; CHECK-NEXT: v_mov_b32_e32 v3, s1 -; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: v_mov_b32_e32 v12, s3 +; CHECK-NEXT: v_mov_b32_e32 v11, s2 +; CHECK-NEXT: flat_load_ubyte v13, v[11:12] offset:46 +; CHECK-NEXT: flat_load_ushort v14, v[11:12] offset:44 +; CHECK-NEXT: flat_load_dwordx3 v[8:10], v[11:12] offset:32 +; CHECK-NEXT: flat_load_dwordx4 v[0:3], v[11:12] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[11:12] +; CHECK-NEXT: v_mov_b32_e32 v12, s1 +; CHECK-NEXT: v_mov_b32_e32 v11, s0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13 -; CHECK-NEXT: flat_store_byte v[2:3], 
v18 offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16 -; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46 -; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45 -; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44 -; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43 -; CHECK-NEXT: flat_load_ubyte v8, v[0:1] 
offset:42 -; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41 -; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40 -; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39 -; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38 -; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37 -; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36 -; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35 -; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34 -; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33 -; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46 -; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45 -; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44 -; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43 -; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42 -; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41 -; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40 -; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39 -; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38 -; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37 -; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36 -; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35 -; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34 -; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33 -; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32 -; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31 +; CHECK-NEXT: flat_store_byte v[11:12], v13 offset:46 +; CHECK-NEXT: flat_store_short v[11:12], v14 offset:44 +; CHECK-NEXT: flat_store_dwordx3 v[11:12], v[8:10] offset:32 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[0:3] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[11:12], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false) @@ -1431,375 +383,59 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add ; 
CHECK-NEXT: s_mov_b64 s[16:17], s[0:1] ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x8 ; CHECK-NEXT: s_load_dword s2, s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v24, 0 ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80 +; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64 +; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48 +; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, s2 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: 
buffer_store_byte v3, v1, s[16:19], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:10 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23 +; CHECK-NEXT: v_mov_b32_e32 v25, s2 +; CHECK-NEXT: s_waitcnt vmcnt(5) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(9) +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] +; CHECK-NEXT: s_waitcnt vmcnt(13) +; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt vmcnt(16) +; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76 +; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt 
vmcnt(19) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:9 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:8 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:7 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20 +; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48 ; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:6 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:5 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:2 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:4 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:3 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] 
offset:27 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:23 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:22 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:21 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:20 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:19 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:29 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:18 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:16 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:27 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] 
offset:40 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:26 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:25 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:24 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:44 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:43 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:45 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:36 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:35 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:47 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60 -; CHECK-NEXT: s_waitcnt vmcnt(33) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:34 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:28 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:42 -; CHECK-NEXT: global_load_ubyte 
v20, v0, s[0:1] offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:33 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:32 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:61 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:40 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:39 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:38 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:37 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:57 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:56 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:58 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:48 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:46 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59 -; CHECK-NEXT: 
s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:60 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:41 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:55 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:74 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:53 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:52 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:51 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:63 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:50 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:77 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:71 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:70 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:69 
-; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:59 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:73 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:54 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:68 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:66 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:65 -; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:64 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:62 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:76 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:90 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:72 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte 
v14, v1, s[16:19], 0 offen offset:87 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:67 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:79 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:95 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:93 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:75 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88 +; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32 +; CHECK-NEXT: s_waitcnt vmcnt(21) +; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:89 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:78 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: 
buffer_store_byte v7, v1, s[16:19], 0 offen offset:94 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:92 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:88 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:91 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:86 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:85 -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:84 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:83 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:82 -; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97 -; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98 -; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99 -; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:81 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:80 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:111 -; CHECK-NEXT: buffer_store_byte v4, v1, s[16:19], 0 offen offset:110 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:109 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:108 -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:100 -; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121 -; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122 -; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123 -; 
CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124 -; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125 -; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:107 -; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:105 -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:103 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:101 -; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116 -; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117 -; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119 -; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(34) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:104 -; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115 -; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113 -; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v18, v1, s[16:19], 0 offen offset:99 -; CHECK-NEXT: buffer_store_byte v17, v1, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_store_byte v16, v1, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_store_byte v15, v1, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v7, v1, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_store_byte v6, v1, s[16:19], 0 offen offset:126 -; CHECK-NEXT: buffer_store_byte v5, v1, s[16:19], 0 offen offset:125 -; CHECK-NEXT: buffer_store_byte v4, v1, 
s[16:19], 0 offen offset:124 -; CHECK-NEXT: buffer_store_byte v3, v1, s[16:19], 0 offen offset:123 -; CHECK-NEXT: buffer_store_byte v2, v1, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_store_byte v20, v1, s[16:19], 0 offen offset:121 -; CHECK-NEXT: buffer_store_byte v19, v1, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v11, v1, s[16:19], 0 offen offset:119 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v10, v1, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_store_byte v9, v1, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_store_byte v8, v1, s[16:19], 0 offen offset:116 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v13, v1, s[16:19], 0 offen offset:115 -; CHECK-NEXT: buffer_store_byte v12, v1, s[16:19], 0 offen offset:114 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v1, s[16:19], 0 offen offset:113 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v21, v1, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false) @@ -1815,363 +451,57 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) % ; CHECK-NEXT: s_add_u32 s16, s16, s13 ; CHECK-NEXT: s_addc_u32 s17, s17, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v2, s0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:12 -; CHECK-NEXT: 
buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:2 +; CHECK-NEXT: v_mov_b32_e32 v26, s0 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: 
flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:23 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:22 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:21 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:20 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:19 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:18 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:17 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:47 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:16 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:45 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; 
CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:37 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:36 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:35 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:34 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:33 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:32 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:29 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:44 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:63 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:42 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:40 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:39 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:38 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:41 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: 
flat_store_byte v[0:1], v19 offset:45 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:59 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:51 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:50 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:49 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:48 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:46 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:61 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:43 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:58 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:79 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:56 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:54 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:53 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen 
offset:52 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:55 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:73 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:65 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:64 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:62 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:77 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:60 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:75 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:57 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:72 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:95 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:70 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:68 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53 -; CHECK-NEXT: 
buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:67 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:66 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:69 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:87 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:111 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen offset:110 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:76 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:91 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:74 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:89 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:71 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:86 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:84 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:83 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: 
flat_store_byte v[0:1], v7 offset:67 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:81 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:80 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:78 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:93 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:82 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:101 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:90 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:105 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:88 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:103 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:85 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:100 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:92 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 0 offen offset:107 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; 
CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:104 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:102 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:99 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:94 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:109 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:106 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:108 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[16:19], 0 offen offset:96 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[16:19], 0 offen offset:97 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[16:19], 0 offen offset:98 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[16:19], 0 offen offset:120 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[16:19], 0 offen offset:121 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[16:19], 0 offen offset:122 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[16:19], 0 offen 
offset:123 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[16:19], 0 offen offset:124 +; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32 +; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36 +; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40 +; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44 +; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48 +; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52 +; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56 +; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60 +; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92 +; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88 +; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84 +; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, s1 +; CHECK-NEXT: v_mov_b32_e32 v24, s0 +; CHECK-NEXT: s_waitcnt vmcnt(20) +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112 +; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76 ; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[16:19], 
0 offen offset:126 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[16:19], 0 offen offset:116 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[16:19], 0 offen offset:117 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[16:19], 0 offen offset:118 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[16:19], 0 offen offset:119 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[16:19], 0 offen offset:127 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[16:19], 0 offen offset:114 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[16:19], 0 offen offset:115 +; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72 +; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68 +; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[16:19], 0 offen offset:125 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[16:19], 0 offen offset:113 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[16:19], 0 offen offset:112 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96 +; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[16:19] offset:48 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[12:15] offset:32 +; CHECK-NEXT: flat_store_dwordx4 
v[24:25], v[8:11] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112 +; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false) @@ -2218,279 +548,27 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 { ; CHECK-LABEL: memcpy_p0_p3_optsize: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:112 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:113 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:114 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:115 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:116 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:117 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:118 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:119 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s0 -; CHECK-NEXT: v_mov_b32_e32 v1, s1 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116 -; CHECK-NEXT: flat_store_byte v[0:1], v8 
offset:117 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:120 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:121 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:122 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:123 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:124 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:125 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:126 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:127 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:96 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:97 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:98 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:99 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:100 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:101 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:102 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:103 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:104 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:105 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:106 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:107 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:108 -; CHECK-NEXT: ds_read_u8 v8, v2 
offset:109 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:110 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:111 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:80 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:81 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:82 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:83 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:84 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:85 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:86 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:87 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:88 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:89 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:90 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:91 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:92 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:93 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:94 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:95 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92 
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:64 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:65 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:66 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:67 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:68 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:69 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:70 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:71 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:72 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:73 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:74 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:75 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:76 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:77 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:78 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:79 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:48 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:49 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:50 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:51 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:52 -; CHECK-NEXT: ds_read_u8 v8, 
v2 offset:53 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:54 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:55 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:56 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:57 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:58 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:59 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:60 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:61 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:62 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:63 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:32 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:33 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:34 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:35 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:36 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:37 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:38 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:39 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36 -; 
CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:40 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:41 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:42 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:43 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:44 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:45 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:46 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:47 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47 -; CHECK-NEXT: ds_read_u8 v3, v2 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:23 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v3 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13 -; CHECK-NEXT: 
flat_store_byte v[0:1], v17 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: v_mov_b32_e32 v16, 0 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5 +; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v21, s1 +; CHECK-NEXT: v_mov_b32_e32 v20, s0 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:32 +; CHECK-NEXT: ds_read2_b64 v[0:3], v16 offset0:8 offset1:9 +; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:10 offset1:11 +; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:12 offset1:13 +; CHECK-NEXT: ds_read2_b64 v[16:19], v16 offset0:14 offset1:15 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[12:15] offset:48 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[0:3] offset:64 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[4:7] offset:80 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[8:11] offset:96 +; CHECK-NEXT: flat_store_dwordx4 v[20:21], v[16:19] offset:112 ; CHECK-NEXT: s_endpgm entry: tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll index 7575782c1b2a..cadc3dadb0a1 100644 --- a/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-param-combinations.ll @@ -13,55 +13,9 @@ define void @memcpy_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] 
offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 
offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,101 +27,19 @@ define void @memcpy_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(14) -; CHECK-NEXT: 
flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt 
vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -179,104 +51,13 @@ define void @memcpy_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; 
CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; 
CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -288,31 +69,9 @@ define void @memcpy_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] 
offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -324,55 +83,19 @@ define void @memcpy_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: 
flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; 
CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -384,55 +107,13 @@ define void @memcpy_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; 
CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -458,58 +139,13 @@ define void @memcpy_p0_p0_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p0_sz31_align_8_8: ; 
CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; 
CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -553,58 +189,13 @@ define void @memcpy_p0_p0_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; 
CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(16) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 
v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -634,55 +225,9 @@ define void @memcpy_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt 
vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -694,101 +239,19 @@ define void @memcpy_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:4 -; 
CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], 
off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: 
flat_store_byte v[0:1], v17 offset:17 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -800,104 +263,13 @@ define void @memcpy_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; 
CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off 
offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -909,31 +281,9 @@ define void @memcpy_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; 
CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -945,55 +295,19 @@ define void @memcpy_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off 
offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 
offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1005,55 +319,13 @@ define void @memcpy_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt 
vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1080,35 +352,12 @@ define void @memcpy_p0_p1_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: 
flat_store_dwordx4 v[0:1], v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1153,35 +402,12 @@ define void @memcpy_p0_p1_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_clause 0x1 -; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off -; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:15 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], 
v[4:7] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v11 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v11 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v11 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1211,54 +437,9 @@ define void @memcpy_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 
offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1270,96 +451,19 @@ define void @memcpy_p0_p3_sz31_align_1_1(ptr 
addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v17, v2 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: 
flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1371,100 +475,12 @@ define void @memcpy_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v18, v2 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:17 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: ds_read_u8 v3, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v4 
offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1476,30 +492,9 @@ define void @memcpy_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 
offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1511,54 +506,19 @@ define void @memcpy_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; 
CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1570,54 +530,12 @@ define void @memcpy_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,35 +561,12 @@ define void @memcpy_p0_p3_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p3_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset1:1 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; 
CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1714,35 +609,12 @@ define void @memcpy_p0_p3_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p3_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_b128 v[3:6], v2 -; CHECK-NEXT: ds_read_b128 v[7:10], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[3:6], v2 offset:15 +; CHECK-NEXT: ds_read_b128 v[7:10], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v7 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v8 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v9 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v10 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 
v6, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 8, v10 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1771,55 +643,12 @@ define void @memcpy_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) 
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1831,100 +660,24 @@ define void @memcpy_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:1 
-; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:3 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:5 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:7 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:9 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:10 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:11 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:12 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: 
global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: 
s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 +; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1936,104 +689,18 @@ define void @memcpy_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; 
CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v19 -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: 
global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: 
flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2045,30 +712,12 @@ define void @memcpy_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_ushort v4, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:2 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:4 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:6 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:8 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:10 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:10 
-; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:12 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:14 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2080,55 +729,24 @@ define void @memcpy_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short 
v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dword v4, v[2:3], off offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dword v[0:1], v4 offset:24 +; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 +; CHECK-NEXT: global_load_ubyte v2, v[2:3], off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_byte v[0:1], v2 
offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2140,55 +758,18 @@ define void @memcpy_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 
offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:8 +; CHECK-NEXT: global_load_dwordx2 v[4:5], v[2:3], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[4:5] offset:16 +; CHECK-NEXT: global_load_dwordx2 v[2:3], v[2:3], off offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx2 v[0:1], v[2:3] offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2219,30 +800,7 @@ define void @memcpy_p0_p4_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; 
CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2290,30 +848,7 @@ define void @memcpy_p0_p4_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:15 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v3 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:19 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:23 -; CHECK-NEXT: flat_store_byte_d16_hi v[0:1], v5 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 24, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v2 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 8, v3 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 24, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v4 -; CHECK-NEXT: v_lshrrev_b32_e32 v9, 24, v5 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 8, v5 -; 
CHECK-NEXT: flat_store_byte v[0:1], v6 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:16 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:24 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2342,55 +877,13 @@ define void @memcpy_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; 
CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2402,99 +895,23 @@ define void @memcpy_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: 
buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:5 -; CHECK-NEXT: 
s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt 
vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2506,103 +923,19 @@ define void @memcpy_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen 
offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; 
CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v18 -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(7) 
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2614,31 +947,13 @@ define void @memcpy_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: 
buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2650,55 +965,23 @@ define void @memcpy_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 
-; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: 
buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2710,55 +993,19 @@ define void @memcpy_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; 
CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short 
v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2788,53 +1035,19 @@ define void @memcpy_p0_p5_sz31_align_8_8(ptr addrspace(0) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p0_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 
offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2888,53 +1101,19 @@ define void @memcpy_p0_p5_sz31_align_16_16(ptr addrspace(0) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p0_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: 
buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:25 
-; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:27 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2970,41 +1149,8 @@ define void @memcpy_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte 
v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3016,79 +1162,15 @@ define void @memcpy_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, 
v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v5, v7, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v23, 8, v24 -; 
CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v30 -; CHECK-NEXT: v_lshl_or_b32 v4, v14, 16, v12 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v33, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v2, 16, v18 ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) ; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3100,79 +1182,13 @@ define void @memcpy_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: 
flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; 
CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -3183,23 +1199,8 @@ define void @memcpy_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; 
CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -3211,52 +1212,16 @@ define void @memcpy_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v20, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v21, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v22, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v23, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt 
vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v7, v4, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v4, v19, 16, v20 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[8:9], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v3, v21, 16, v22 -; CHECK-NEXT: v_lshl_or_b32 v9, v13, 16, v12 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v2, v23, 16, v2 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -3267,39 +1232,13 @@ define void @memcpy_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: 
flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull 
align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -3323,47 +1262,13 @@ define void @memcpy_p1_p0_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; 
CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -3404,47 +1309,13 @@ define void @memcpy_p1_p0_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, 
v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v7, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v14, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v19, 8, v20 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -4042,44 +1913,13 @@ define void @memcpy_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, 
v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 
s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -4090,81 +1930,21 @@ define void @memcpy_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte 
v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, 
s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -4175,81 +1955,19 @@ define void @memcpy_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; 
CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; 
CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], 
v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4260,24 +1978,13 @@ define void @memcpy_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void 
@llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4288,52 +1995,21 @@ define void @memcpy_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v19, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v21, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v22, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: 
s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v15 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v12, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v10 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v20, 16, v19 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[7:8], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v22, 16, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[6:7], off offset:16 -; CHECK-NEXT: global_store_dwordx2 v[0:1], v[8:9], off offset:23 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[9:10], off offset:23 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4344,41 +2020,19 @@ define void @memcpy_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: 
s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: 
buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4406,49 +2060,18 @@ define void @memcpy_p1_p5_sz31_align_8_8(ptr addrspace(1) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p1_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen 
offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4500,49 +2123,18 @@ define void @memcpy_p1_p5_sz31_align_16_16(ptr addrspace(1) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p1_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen 
offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 ; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v10, v2, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v20, 8, v19 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:15 +; CHECK-NEXT: 
buffer_load_dword v8, v2, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4577,41 +2169,8 @@ define void @memcpy_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt 
vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4624,80 +2183,16 @@ define void @memcpy_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; 
CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v8 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v22, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v29 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: 
flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v32, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v10, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:23 -; CHECK-NEXT: ds_write_b64 v0, v[3:4] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4709,79 +2204,13 @@ define void @memcpy_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, 
v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 
-; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4793,23 +2222,8 @@ define void @memcpy_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; 
CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4822,51 +2236,16 @@ define void @memcpy_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v19, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v20, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v21, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v22, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: 
v_lshl_or_b32 v1, v10, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v11, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v17, 16, v18 +; CHECK-NEXT: s_clause 0x2 +; CHECK-NEXT: flat_load_dwordx2 v[5:6], v[1:2] offset:23 +; CHECK-NEXT: flat_load_dwordx2 v[7:8], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v22 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:23 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(2) +; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(2) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4878,40 +2257,13 @@ define void @memcpy_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] 
offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4937,47 +2289,13 @@ define void @memcpy_p3_p0_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: 
flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, 
v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5021,47 +2339,13 @@ define void @memcpy_p3_p0_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:17 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v7, v6, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v13, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(4) 
lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v14, v18, 8, v19 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v11 -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_write_b128 v0, v[1:4] -; CHECK-NEXT: ds_write_b128 v0, v[5:8] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[3:6] offset:15 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write_b128 v0, v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5701,44 +2985,13 @@ define void @memcpy_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: 
buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5750,81 +3003,21 @@ define void @memcpy_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: 
buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt 
vmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_waitcnt vmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 
16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5836,81 +3029,19 @@ define void @memcpy_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; 
CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; 
CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5922,24 +3053,13 @@ define void @memcpy_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, 
s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5951,52 +3071,21 @@ define void @memcpy_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen 
offset:29 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v18, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v20, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v10, v15, 8, v14 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v11, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 +; 
CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v5, v21, 16, v20 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 -; CHECK-NEXT: ds_write_b64 v0, v[7:8] offset:23 +; CHECK-NEXT: ds_write_b64 v0, v[8:9] offset:23 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6008,41 +3097,19 @@ define void @memcpy_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; 
CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6072,50 +3139,19 @@ define void @memcpy_p3_p5_sz31_align_8_8(ptr addrspace(3) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p3_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: 
buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v4, v1, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; 
CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v1, v13, 16, v12 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset1:1 -; CHECK-NEXT: ds_write_b128 v0, v[1:4] offset:15 +; CHECK-NEXT: ds_write_b128 v0, v[2:5] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -6169,49 +3205,18 @@ define void @memcpy_p3_p5_sz31_align_16_16(ptr addrspace(3) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p3_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x7 
; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen ; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 ; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v1, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v10, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v11, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v9, v1, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v18 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[2:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: ds_write_b128 v0, v[6:9] offset:15 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -6248,55 +3253,12 @@ define void @memcpy_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: 
flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen 
offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6307,101 +3269,24 @@ define void @memcpy_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xe -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] -; CHECK-NEXT: s_waitcnt 
vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; 
CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort 
v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6412,104 +3297,19 @@ define void @memcpy_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; 
CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: 
buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: 
buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6520,31 +3320,12 @@ define void @memcpy_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6555,55 +3336,24 @@ define void @memcpy_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: 
memcpy_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen 
offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6614,55 +3364,19 @@ define void @memcpy_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) 
-; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6689,61 +3403,19 @@ define void @memcpy_p5_p0_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p0_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, 
v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen 
offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(0) noundef nonnull align 8 %src, i64 31, i1 false) @@ -6793,61 +3465,19 @@ define void @memcpy_p5_p0_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p0_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x10 
-; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:28 -; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, 
v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:15 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen -; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p0.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(0) noundef nonnull align 16 %src, i64 31, i1 false) @@ -6881,55 +3511,12 @@ define void @memcpy_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: 
memcpy_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: 
s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6940,207 +3527,47 @@ define void @memcpy_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte 
v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: 
s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: 
buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) ret void -} - -define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { -; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1: -; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: 
global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen 
offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: 
buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +} + +define void @memcpy_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addrspace(1) align 1 readonly %src) { +; CHECK-LABEL: memcpy_p5_p1_sz32_align_1_1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) 
noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7151,31 +3578,12 @@ define void @memcpy_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void 
@llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7186,55 +3594,24 @@ define void @memcpy_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; 
CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull 
align 2 %src, i64 31, i1 false) @@ -7245,55 +3622,19 @@ define void @memcpy_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) 
-; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7329,30 +3670,10 @@ define void @memcpy_p5_p1_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: 
s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(1) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7411,30 +3732,10 @@ define void 
@memcpy_p5_p1_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail 
call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(1) noundef nonnull align 16 %src, i64 31, i1 false) @@ -7468,54 +3769,12 @@ define void @memcpy_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: 
s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7526,85 +3785,25 @@ define void @memcpy_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:12 -; CHECK-NEXT: ds_read_u8 
v15, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:11 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen 
offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7615,79 +3814,18 @@ define void @memcpy_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v10, v1 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:23 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: 
buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7698,30 +3836,12 @@ define void @memcpy_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: 
ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7732,54 +3852,25 @@ define void @memcpy_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, 
v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: 
buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7790,54 +3881,18 @@ define void @memcpy_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: 
ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: 
buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7872,30 +3927,10 @@ define void @memcpy_p5_p3_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: 
buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(3) noundef nonnull align 8 %src, i64 31, i1 false) @@ -7952,30 +3987,10 @@ define void @memcpy_p5_p3_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v6, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v6 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v9 -; CHECK-NEXT: buffer_store_byte v1, v0, 
s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(3) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8008,55 +4023,12 @@ define void @memcpy_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: 
global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen 
offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8067,100 +4039,24 @@ define void @memcpy_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off 
offset:21 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:30 -; 
CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:15 +; 
CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8171,103 +4067,19 @@ define void @memcpy_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v22, 
v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(19) 
-; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: 
buffer_store_byte v33, v0, s[0:3], 0 offen offset:17 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8278,31 +4090,12 @@ define void @memcpy_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; 
CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8313,55 +4106,24 @@ define void @memcpy_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], 
off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; 
CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8372,55 +4134,19 @@ define void @memcpy_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: 
global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: 
s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8456,30 +4182,10 @@ define void @memcpy_p5_p4_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; 
CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(4) noundef nonnull align 8 %src, i64 31, i1 false) @@ -8538,30 +4244,10 @@ define void @memcpy_p5_p4_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte_d16_hi v7, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte_d16_hi v8, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte_d16_hi v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte_d16_hi v10, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: v_lshrrev_b32_e32 v1, 24, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v2, 8, v7 -; CHECK-NEXT: v_lshrrev_b32_e32 v3, 24, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v4, 8, v8 -; CHECK-NEXT: v_lshrrev_b32_e32 v5, 24, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v6, 8, v9 -; CHECK-NEXT: v_lshrrev_b32_e32 v7, 24, v10 -; CHECK-NEXT: v_lshrrev_b32_e32 v8, 8, v10 -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: 
buffer_store_byte v2, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(4) noundef nonnull align 16 %src, i64 31, i1 false) @@ -8595,55 +4281,19 @@ define void @memcpy_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: 
buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 
offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8654,99 +4304,34 @@ define void @memcpy_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: 
buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xc -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; 
CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen 
offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8757,103 +4342,31 @@ define void @memcpy_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x11 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte 
v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 
offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen -; CHECK-NEXT: s_clause 0xd -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; 
CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, 
s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8864,31 +4377,19 @@ define void @memcpy_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 
; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8899,55 +4400,34 @@ define void @memcpy_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, 
s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword 
v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8958,55 +4438,31 @@ define void @memcpy_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; 
CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: 
buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -9040,67 +4496,31 @@ define void @memcpy_p5_p5_sz31_align_8_8(ptr addrspace(5) align 8 %dst, ptr addr ; CHECK-LABEL: memcpy_p5_p5_sz31_align_8_8: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: 
buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: 
buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: 
buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 8 %dst, ptr addrspace(5) noundef nonnull align 8 %src, i64 31, i1 false) @@ -9169,67 +4589,31 @@ define void @memcpy_p5_p5_sz31_align_16_16(ptr addrspace(5) align 16 %dst, ptr a ; CHECK-LABEL: memcpy_p5_p5_sz31_align_16_16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x13 -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_dword v16, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_dword v17, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_dword v18, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_dword v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: 
buffer_store_byte v3, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:15 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:19 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:23 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:29 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:15 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:19 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; 
CHECK-NEXT: buffer_store_dword v16, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:23 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_dword v17, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:27 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_dword v18, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_dword v19, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:27 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) noundef nonnull align 16 %dst, ptr addrspace(5) noundef nonnull align 16 %src, i64 31, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll index cc5256620bfe..4e5688adcd6b 100644 --- a/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll +++ b/llvm/test/CodeGen/AMDGPU/memmove-param-combinations.ll @@ -13,55 +13,9 @@ define void @memmove_p0_p0_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] 
offset:10 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 -; 
CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -73,100 +27,19 @@ define void @memmove_p0_p0_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:4 -; CHECK-NEXT: 
flat_load_ubyte v31, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: 
s_waitcnt vmcnt(12) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -178,103 +51,13 @@ define void 
@memmove_p0_p0_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; 
CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; 
CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -286,31 +69,9 @@ define void @memmove_p0_p0_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:6 -; 
CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -322,55 +83,19 @@ define void @memmove_p0_p0_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: 
flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], 
v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -382,55 +107,13 @@ define void @memmove_p0_p0_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v2, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(15) -; CHECK-NEXT: 
flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -559,55 +242,9 @@ define void @memmove_p0_p1_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off 
offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: 
flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -619,100 +256,19 @@ define void @memmove_p0_p1_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], 
off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) 
-; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -724,103 +280,13 @@ define void @memmove_p0_p1_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: 
global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt 
vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; 
CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -832,31 +298,9 @@ define void @memmove_p0_p1_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; 
CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -868,55 +312,19 @@ define void @memmove_p0_p1_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: 
s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -928,55 +336,13 @@ define void @memmove_p0_p1_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: 
flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1105,54 +471,9 @@ define void @memmove_p0_p3_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:7 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1164,72 +485,19 @@ define void @memmove_p0_p3_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 ; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:20 -; 
CHECK-NEXT: ds_read_u8 v15, v2 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v2 -; CHECK-NEXT: ds_read_u8 v27, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) ; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:15 -; 
CHECK-NEXT: flat_store_byte v[0:1], v24 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:11 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v26 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1241,74 +509,12 @@ define void @memmove_p0_p3_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:24 -; CHECK-NEXT: ds_read_u8 v4, v2 offset:25 -; CHECK-NEXT: ds_read_u8 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u8 v6, v2 offset:27 -; CHECK-NEXT: ds_read_u8 v7, v2 offset:28 -; CHECK-NEXT: ds_read_u8 v8, v2 offset:29 -; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 -; CHECK-NEXT: ds_read_u8 v10, v2 offset:31 -; CHECK-NEXT: ds_read_u8 v11, v2 offset:16 -; CHECK-NEXT: ds_read_u8 v12, v2 offset:17 -; CHECK-NEXT: ds_read_u8 v13, v2 offset:18 -; CHECK-NEXT: ds_read_u8 v14, v2 offset:19 -; CHECK-NEXT: ds_read_u8 v15, v2 offset:20 -; CHECK-NEXT: ds_read_u8 v16, v2 offset:21 -; 
CHECK-NEXT: ds_read_u8 v17, v2 offset:22 -; CHECK-NEXT: ds_read_u8 v18, v2 offset:23 -; CHECK-NEXT: ds_read_u8 v19, v2 offset:8 -; CHECK-NEXT: ds_read_u8 v20, v2 offset:9 -; CHECK-NEXT: ds_read_u8 v21, v2 offset:10 -; CHECK-NEXT: ds_read_u8 v22, v2 offset:11 -; CHECK-NEXT: ds_read_u8 v23, v2 offset:12 -; CHECK-NEXT: ds_read_u8 v24, v2 offset:13 -; CHECK-NEXT: ds_read_u8 v25, v2 offset:14 -; CHECK-NEXT: ds_read_u8 v26, v2 offset:15 -; CHECK-NEXT: ds_read_u8 v27, v2 -; CHECK-NEXT: ds_read_u8 v28, v2 offset:1 -; CHECK-NEXT: ds_read_u8 v29, v2 offset:2 -; CHECK-NEXT: ds_read_u8 v30, v2 offset:3 -; CHECK-NEXT: ds_read_u8 v31, v2 offset:4 -; CHECK-NEXT: ds_read_u8 v32, v2 offset:5 -; CHECK-NEXT: ds_read_u8 v33, v2 offset:6 -; CHECK-NEXT: ds_read_u8 v2, v2 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:31 -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:29 -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26 -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25 -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:23 -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:22 -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:21 -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:18 -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:17 -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:15 -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:14 -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:13 -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; 
CHECK-NEXT: flat_store_byte v[0:1], v21 offset:10 -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:9 -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7 -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:6 -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:5 -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:2 -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:1 -; CHECK-NEXT: flat_store_byte v[0:1], v27 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1320,30 +526,9 @@ define void @memmove_p0_p3_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt 
lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1355,54 +540,19 @@ define void @memmove_p0_p3_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; 
CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read_u8 v9, v2 offset:30 +; CHECK-NEXT: ds_read_b32 v8, v2 offset:24 +; CHECK-NEXT: ds_read_u16 v10, v2 offset:28 +; CHECK-NEXT: ds_read_b64 v[6:7], v2 offset:16 +; CHECK-NEXT: ds_read2_b64 v[2:5], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1414,54 +564,12 @@ define void @memmove_p0_p3_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v3, v2 offset:30 -; CHECK-NEXT: ds_read_u16 v4, v2 offset:28 -; CHECK-NEXT: ds_read_u16 v5, v2 offset:26 -; CHECK-NEXT: ds_read_u16 v6, v2 offset:24 -; CHECK-NEXT: ds_read_u16 v7, v2 offset:22 -; CHECK-NEXT: ds_read_u16 v8, v2 offset:20 -; CHECK-NEXT: ds_read_u16 v9, v2 
offset:18 -; CHECK-NEXT: ds_read_u16 v10, v2 offset:16 -; CHECK-NEXT: ds_read_u16 v11, v2 offset:14 -; CHECK-NEXT: ds_read_u16 v12, v2 offset:12 -; CHECK-NEXT: ds_read_u16 v13, v2 offset:10 -; CHECK-NEXT: ds_read_u16 v14, v2 offset:8 -; CHECK-NEXT: ds_read_u16 v15, v2 offset:6 -; CHECK-NEXT: ds_read_u16 v16, v2 offset:4 -; CHECK-NEXT: ds_read_u16 v17, v2 offset:2 -; CHECK-NEXT: ds_read_u16 v2, v2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: ds_read2_b64 v[3:6], v2 offset0:2 offset1:3 +; CHECK-NEXT: ds_read2_b64 v[7:10], v2 offset1:1 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: 
flat_store_dwordx4 v[0:1], v[3:6] offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1583,55 +691,9 @@ define void @memmove_p0_p4_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v10 
offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:1 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1643,100 +705,19 @@ define void @memmove_p0_p4_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:18 -; 
CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte 
v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; 
CHECK-NEXT: flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1748,103 +729,13 @@ define void @memmove_p0_p4_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:31 -; CHECK-NEXT: global_load_ubyte v5, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ubyte v6, v[2:3], off offset:29 -; CHECK-NEXT: global_load_ubyte v7, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ubyte v8, v[2:3], off offset:27 -; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ubyte v10, v[2:3], off offset:25 -; CHECK-NEXT: global_load_ubyte v11, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ubyte v12, v[2:3], off offset:23 -; CHECK-NEXT: global_load_ubyte v13, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ubyte v14, v[2:3], off offset:21 -; CHECK-NEXT: global_load_ubyte v15, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ubyte v16, v[2:3], off offset:19 -; CHECK-NEXT: global_load_ubyte v17, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ubyte v18, v[2:3], off offset:17 -; CHECK-NEXT: global_load_ubyte v19, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ubyte v20, v[2:3], off offset:15 -; CHECK-NEXT: global_load_ubyte v21, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ubyte v22, v[2:3], off offset:13 -; CHECK-NEXT: global_load_ubyte v23, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ubyte v24, v[2:3], off offset:11 -; CHECK-NEXT: global_load_ubyte v25, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ubyte v26, v[2:3], off offset:9 -; CHECK-NEXT: global_load_ubyte v27, v[2:3], off 
offset:8 -; CHECK-NEXT: global_load_ubyte v28, v[2:3], off offset:7 -; CHECK-NEXT: global_load_ubyte v29, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ubyte v30, v[2:3], off offset:5 -; CHECK-NEXT: global_load_ubyte v31, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ubyte v32, v[2:3], off offset:3 -; CHECK-NEXT: global_load_ubyte v33, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ubyte v34, v[2:3], off offset:1 -; CHECK-NEXT: global_load_ubyte v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; 
CHECK-NEXT: flat_store_byte v[0:1], v21 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v33 offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v34 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1856,31 +747,9 @@ define void @memmove_p0_p4_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v6, 
v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:2 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1892,55 +761,19 @@ define void @memmove_p0_p4_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: 
global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_ubyte v9, v[2:3], off offset:30 +; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:28 +; CHECK-NEXT: global_load_dwordx3 v[6:8], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[2:5], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 +; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; 
CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[6:8] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[2:5] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -1952,55 +785,13 @@ define void @memmove_p0_p4_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v4, v[2:3], off offset:30 -; CHECK-NEXT: global_load_ushort v5, v[2:3], off offset:28 -; CHECK-NEXT: global_load_ushort v6, v[2:3], off offset:26 -; CHECK-NEXT: global_load_ushort v7, v[2:3], off offset:24 -; CHECK-NEXT: global_load_ushort v8, v[2:3], off offset:22 -; CHECK-NEXT: global_load_ushort v9, v[2:3], off offset:20 -; CHECK-NEXT: global_load_ushort v10, v[2:3], off offset:18 -; CHECK-NEXT: global_load_ushort v11, v[2:3], off offset:16 -; CHECK-NEXT: global_load_ushort v12, v[2:3], off offset:14 -; CHECK-NEXT: global_load_ushort v13, v[2:3], off offset:12 -; CHECK-NEXT: global_load_ushort v14, v[2:3], off offset:10 -; CHECK-NEXT: global_load_ushort v15, v[2:3], off offset:8 -; CHECK-NEXT: global_load_ushort v16, v[2:3], off offset:6 -; CHECK-NEXT: global_load_ushort v17, v[2:3], off offset:4 -; CHECK-NEXT: global_load_ushort v18, v[2:3], off offset:2 -; CHECK-NEXT: global_load_ushort v2, v[2:3], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:22 
-; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[4:7], v[2:3], off offset:16 +; CHECK-NEXT: global_load_dwordx4 v[8:11], v[2:3], off ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v18 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[4:7] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[8:11] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2129,55 +920,13 @@ define void @memmove_p0_p5_sz16_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 
offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; 
CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2189,100 +938,23 @@ define void @memmove_p0_p5_sz31_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen 
offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) 
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:5 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: 
flat_store_byte v[0:1], v32 offset:1 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2294,103 +966,19 @@ define void @memmove_p0_p5_sz32_align_1_1(ptr addrspace(0) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte 
v23, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18 -; 
CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_byte v[0:1], v27 offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_byte v[0:1], v28 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_byte v[0:1], v29 offset:5 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_byte v[0:1], v30 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_byte v[0:1], v31 offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_byte v[0:1], v32 offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: 
flat_store_byte v[0:1], v33 offset:1 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_byte v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2402,31 +990,13 @@ define void @memmove_p0_p5_sz16_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:2 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt 
vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2438,55 +1008,23 @@ define void @memmove_p0_p5_sz31_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; 
CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: flat_store_short v[0:1], v10 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:30 +; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[7:9] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2498,55 +1036,19 @@ define void @memmove_p0_p5_sz32_align_2_2(ptr addrspace(0) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p0_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; 
%entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v2, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: flat_store_short v[0:1], v3 offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: flat_store_short v[0:1], v4 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: flat_store_short v[0:1], v5 offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: flat_store_short v[0:1], v6 offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: flat_store_short v[0:1], v7 offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: flat_store_short v[0:1], v8 offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: flat_store_short v[0:1], v9 offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: flat_store_short v[0:1], v10 offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: flat_store_short v[0:1], v11 offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; 
CHECK-NEXT: flat_store_short v[0:1], v12 offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: flat_store_short v[0:1], v13 offset:10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: flat_store_short v[0:1], v14 offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: flat_store_short v[0:1], v15 offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: flat_store_short v[0:1], v16 offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: flat_store_short v[0:1], v17 offset:2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[3:6] offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: flat_store_short v[0:1], v2 +; CHECK-NEXT: flat_store_dwordx4 v[0:1], v[7:10] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2698,41 +1200,8 @@ define void @memmove_p1_p0_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v12, 
v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v8, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v18, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2744,79 +1213,18 @@ define void @memmove_p1_p0_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v5, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] 
offset:20 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:29 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v9, v8, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v10, v10, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v12, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v16, v6, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v15, v18, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: 
v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: v_lshl_or_b32 v8, v14, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v18, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v17, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v20, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v19, v28, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v4, v18, 16, v17 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v21, v30, 8, v31 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v22, v32, 8, v33 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v12, v12, v2 -; CHECK-NEXT: v_lshl_or_b32 v3, v20, 16, v19 -; CHECK-NEXT: v_lshl_or_b32 v2, v22, 16, v21 -; CHECK-NEXT: global_store_byte v[0:1], v13, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v12, off offset:28 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2828,79 +1236,13 @@ define void @memmove_p1_p0_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v4, v[2:3] offset:29 -; CHECK-NEXT: 
flat_load_ubyte v5, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ubyte v6, v[2:3] offset:31 -; CHECK-NEXT: flat_load_ubyte v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ubyte v8, v[2:3] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ubyte v10, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ubyte v11, v[2:3] offset:27 -; CHECK-NEXT: flat_load_ubyte v12, v[2:3] offset:15 -; CHECK-NEXT: flat_load_ubyte v13, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ubyte v14, v[2:3] offset:13 -; CHECK-NEXT: flat_load_ubyte v15, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ubyte v16, v[2:3] offset:23 -; CHECK-NEXT: flat_load_ubyte v17, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ubyte v18, v[2:3] offset:21 -; CHECK-NEXT: flat_load_ubyte v19, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:19 -; CHECK-NEXT: flat_load_ubyte v21, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ubyte v22, v[2:3] offset:17 -; CHECK-NEXT: flat_load_ubyte v23, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ubyte v24, v[2:3] offset:11 -; CHECK-NEXT: flat_load_ubyte v25, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ubyte v26, v[2:3] offset:9 -; CHECK-NEXT: flat_load_ubyte v27, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ubyte v28, v[2:3] offset:7 -; CHECK-NEXT: flat_load_ubyte v29, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ubyte v30, v[2:3] offset:5 -; CHECK-NEXT: flat_load_ubyte v31, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ubyte v32, v[2:3] offset:1 -; CHECK-NEXT: flat_load_ubyte v33, v[2:3] -; CHECK-NEXT: flat_load_ubyte v34, v[2:3] offset:3 -; CHECK-NEXT: flat_load_ubyte v2, v[2:3] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 8, v5 -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v12, v12, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(20) 
lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v14, v14, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v11, v20, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, v22, 8, v23 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v15, v24, 8, v25 -; CHECK-NEXT: v_lshl_or_b32 v9, v12, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v26, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v17, v28, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v30, 8, v31 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v33 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v34, 8, v2 -; CHECK-NEXT: v_lshl_or_b32 v2, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v6, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -2911,23 +1253,8 @@ define void 
@memmove_p1_p0_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v4, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v6 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v7 ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2939,41 +1266,18 @@ define void @memmove_p1_p0_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v5, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: 
flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: flat_load_ubyte v20, v[2:3] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v7, v12, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v9, v[2:3] offset:30 +; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[6:8], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[2:5], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v3, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v9, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v2, v18, 16, v19 -; CHECK-NEXT: global_store_short v[0:1], v11, off offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v20, off offset:30 ; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -2985,39 +1289,13 @@ define void @memmove_p1_p0_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v4, v[2:3] offset:30 -; CHECK-NEXT: flat_load_ushort 
v5, v[2:3] offset:28 -; CHECK-NEXT: flat_load_ushort v6, v[2:3] offset:26 -; CHECK-NEXT: flat_load_ushort v7, v[2:3] offset:14 -; CHECK-NEXT: flat_load_ushort v8, v[2:3] offset:12 -; CHECK-NEXT: flat_load_ushort v10, v[2:3] offset:10 -; CHECK-NEXT: flat_load_ushort v11, v[2:3] offset:8 -; CHECK-NEXT: flat_load_ushort v9, v[2:3] offset:24 -; CHECK-NEXT: flat_load_ushort v12, v[2:3] offset:22 -; CHECK-NEXT: flat_load_ushort v13, v[2:3] offset:20 -; CHECK-NEXT: flat_load_ushort v14, v[2:3] offset:18 -; CHECK-NEXT: flat_load_ushort v15, v[2:3] offset:16 -; CHECK-NEXT: flat_load_ushort v16, v[2:3] offset:6 -; CHECK-NEXT: flat_load_ushort v17, v[2:3] offset:4 -; CHECK-NEXT: flat_load_ushort v18, v[2:3] offset:2 -; CHECK-NEXT: flat_load_ushort v19, v[2:3] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v5, v4, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v6, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v9, v7, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v3, v12, 16, v13 -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v7, v16, 16, v17 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[2:3] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[8:11], v[2:3] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[4:7], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v6, v18, 16, v19 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off offset:16 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[8:11], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p0.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, 
i1 false) @@ -3783,44 +2061,13 @@ define void @memmove_p1_p5_sz16_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v9, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, 
s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v11, v2, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -3831,82 +2078,24 @@ define void @memmove_p1_p5_sz31_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:9 -; 
CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v2, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v4, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: v_lshl_or_b32 v8, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v15, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v9, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v12, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v13, v19, 8, v18 -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v10, 
v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v11, v23, 8, v22 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v15, v26, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v14, v24, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v27 -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v18, v4, v32 -; CHECK-NEXT: v_lshl_or_b32 v4, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v17 +; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v33, off offset:30 -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 
1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -3917,81 +2106,19 @@ define void @memmove_p1_p5_sz32_align_1_1(ptr addrspace(1) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v3, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v4, v2, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v6, v2, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v8, v2, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v10, v2, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v12, v2, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v13, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v14, v2, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v15, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v16, v2, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v17, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v18, v2, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v20, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v21, v2, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v22, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v23, v2, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v24, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v25, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v26, v2, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v27, v2, s[0:3], 0 offen 
offset:28 -; CHECK-NEXT: buffer_load_ubyte v28, v2, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v29, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v30, v2, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v31, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v32, v2, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v33, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v4, v4, 8, v3 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v5, v6, 8, v5 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v7, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v8, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v10, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v11, v14, 8, v13 -; CHECK-NEXT: v_lshl_or_b32 v3, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 8, v15 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v9, v18, 8, v17 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v13, v21, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v15, v23, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v12, v19, 8, v25 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v14, v26, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v16, v28, 8, v27 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: 
buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v17, v30, 8, v29 -; CHECK-NEXT: v_lshl_or_b32 v7, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v18, v32, 8, v31 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v19, v2, 8, v33 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v5, v9, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 16, v16 -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 16, v18 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -4002,24 +2129,13 @@ define void @memmove_p1_p5_sz16_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: 
v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v7 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v4, v10, 16, v9 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -4030,43 +2146,24 @@ define void @memmove_p1_p5_sz31_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:16 -; 
CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v19, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v8, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v6, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v7 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v11, v2, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: global_store_short v[0:1], v18, off offset:28 +; CHECK-NEXT: global_store_short v[0:1], v10, off offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: global_store_byte v[0:1], v11, off offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(3) +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_byte v[0:1], v19, off offset:30 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx3 v[0:1], v[6:8], off offset:16 +; CHECK-NEXT: global_store_dwordx3 v[0:1], v[7:9], off 
offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -4077,41 +2174,19 @@ define void @memmove_p1_p5_sz32_align_2_2(ptr addrspace(1) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p1_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v4, v2, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v2, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v3, v2, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v6, v2, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v7, v2, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v8, v2, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v9, v2, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v10, v2, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v2, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v12, v2, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v13, v2, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v14, v2, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v15, v2, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v16, v2, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v17, v2, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v18, v2, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v6, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: 
buffer_load_dword v3, v2, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v4, v2, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v7, v2, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v8, v2, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v9, v2, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v10, v2, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v7, v14, 16, v13 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v16, 16, v15 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[3:6], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v8, v18, 16, v17 -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[2:5], off -; CHECK-NEXT: global_store_dwordx4 v[0:1], v[6:9], off offset:16 +; CHECK-NEXT: global_store_dwordx4 v[0:1], v[7:10], off offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p1.p5.i64(ptr addrspace(1) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) @@ -4258,41 +2333,8 @@ define void @memmove_p3_p0_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] 
offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v7, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 8, v16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v9, v17, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4305,82 +2347,20 @@ define void @memmove_p3_p0_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:29 -; CHECK-NEXT: 
flat_load_ubyte v12, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v5, v5, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v7, v7, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: v_lshl_or_b32 v9, v9, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshlrev_b16 v11, 8, v11 -; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v5 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: v_lshl_or_b32 v13, v13, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: v_lshl_or_b32 v2, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: v_lshl_or_b32 v10, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v8, v19, 8, v20 -; 
CHECK-NEXT: v_lshl_or_b32 v13, v13, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v21, 8, v22 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v6, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v16, v25, 8, v26 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v3, v14, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v17, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v18, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v11, v11, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v10, 16, v8 -; CHECK-NEXT: v_lshl_or_b32 v6, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v5, v18, 16, v17 -; CHECK-NEXT: ds_write_b8 v0, v12 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v13 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v11 offset:28 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4392,79 +2372,13 @@ define void @memmove_p3_p0_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_1_1: ; CHECK: ; 
%bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 8, v9 -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 8, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; 
CHECK-NEXT: v_lshl_or_b32 v5, v10, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v11, v11, 8, v12 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v14 -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v7, v15, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v6, v17, 8, v18 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v9, v19, 8, v20 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v21, 8, v22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v13, v23, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v12, v25, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v28 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v29, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v16, v31, 8, v32 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v17, v33, 8, v1 -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 -; CHECK-NEXT: v_lshl_or_b32 v6, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: 
s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4476,23 +2390,8 @@ define void @memmove_p3_p0_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v1, v8, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v5 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v6 ; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -4505,43 +2404,20 @@ define void @memmove_p3_p0_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:20 
-; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v4, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v3, v5, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v8, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v2, v10, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v1, v12, 16, v13 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: v_lshl_or_b32 v6, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: v_lshl_or_b32 v5, v16, 16, v17 -; CHECK-NEXT: ds_write_b16 v0, v7 offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) -; CHECK-NEXT: ds_write_b8 v0, v18 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 -; CHECK-NEXT: ds_write_b64 v0, v[1:2] offset:16 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[3:4] offset1:1 +; CHECK-NEXT: ds_write_b8 v0, v8 offset:30 +; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3) +; CHECK-NEXT: ds_write_b32 v0, v7 offset:24 +; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(4) +; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -4553,40 +2429,13 
@@ define void @memmove_p3_p0_sz32_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v18, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v2, v3, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v6, v6, 16, v7 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v5, v8, 16, v9 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v3, v13, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v16 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v18 -; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[1:2] offset0:2 offset1:3 -; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[5:6] offset1:1 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: 
flat_load_dwordx4 v[7:10], v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[3:4], v[5:6] offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(1) +; CHECK-NEXT: ds_write2_b64 v0, v[7:8], v[9:10] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5398,44 +3247,13 @@ define void @memmove_p3_p5_sz16_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; 
CHECK-NEXT: v_lshl_or_b32 v5, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v8, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v9, v15, 8, v14 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v10, v1, 8, v16 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5447,83 +3265,26 @@ define void @memmove_p3_p5_sz31_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: 
buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: v_lshl_or_b32 v1, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshlrev_b16 v3, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v6, v8, 8, v7 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v7, v10, 8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v4, v12, 8, v11 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 8, v13 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v10, v16, 8, v15 -; CHECK-NEXT: v_lshl_or_b32 v16, v2, 16, v1 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: 
v_lshl_or_b32 v11, v18, 8, v17 -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v8, v20, 8, v19 -; CHECK-NEXT: v_lshl_or_b32 v1, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v9, v22, 8, v21 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: s_waitcnt vmcnt(8) +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: v_lshl_or_b32 v12, v23, 8, v27 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v14, v28, 8, v26 -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 +; CHECK-NEXT: s_waitcnt vmcnt(6) +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v15, v30, 8, v29 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_or_b32_e32 v17, v3, v31 -; CHECK-NEXT: v_lshl_or_b32 v3, v11, 16, v10 -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: v_lshl_or_b32 v5, v14, 16, v15 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b8 v0, v32 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v16 offset:24 -; CHECK-NEXT: ds_write_b16 v0, v17 offset:28 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], 
v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5535,81 +3296,19 @@ define void @memmove_p3_p5_sz32_align_1_1(ptr addrspace(3) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: 
buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: v_lshl_or_b32 v3, v3, 8, v2 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: v_lshl_or_b32 v4, v5, 8, v4 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: v_lshl_or_b32 v6, v7, 8, v6 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: v_lshl_or_b32 v7, v9, 8, v8 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: v_lshl_or_b32 v9, v11, 8, v10 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: v_lshl_or_b32 v10, v13, 8, v12 -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 8, v14 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: v_lshl_or_b32 v8, v17, 8, v16 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: v_lshl_or_b32 v12, v20, 8, v19 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: v_lshl_or_b32 v14, v22, 8, v21 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: v_lshl_or_b32 v11, v18, 8, v24 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v13, v25, 8, v23 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v15, v27, 8, v26 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, 
s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v16, v29, 8, v28 -; CHECK-NEXT: v_lshl_or_b32 v6, v12, 16, v11 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v17, v31, 8, v30 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v18, v1, 8, v32 -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: v_lshl_or_b32 v4, v8, 16, v5 -; CHECK-NEXT: v_lshl_or_b32 v3, v10, 16, v9 -; CHECK-NEXT: v_lshl_or_b32 v5, v13, 16, v14 -; CHECK-NEXT: v_lshl_or_b32 v8, v16, 16, v15 -; CHECK-NEXT: v_lshl_or_b32 v7, v18, 16, v17 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5621,24 +3320,13 @@ define void @memmove_p3_p5_sz16_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 
v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v4, v7, 16, v6 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5650,44 +3338,26 @@ define void @memmove_p3_p5_sz31_align_2_2(ptr addrspace(3) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p3_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:28 -; 
CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v18, v3, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v1, v5, 16, v4 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v2, v7, 16, v6 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:30 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v3, v9, 16, v8 +; CHECK-NEXT: ds_write_b32 v0, v8 offset:24 +; CHECK-NEXT: s_waitcnt vmcnt(7) +; CHECK-NEXT: ds_write_b16 v0, v9 offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v4, v11, 16, v10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 +; CHECK-NEXT: ds_write_b8 v0, v10 offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v5, v15, 16, v14 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: ds_write_b16 v0, v16 offset:28 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b8 v0, v17 offset:30 -; CHECK-NEXT: ds_write_b32 v0, v18 offset:24 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write_b64 v0, v[5:6] offset:16 +; CHECK-NEXT: ds_write_b64 v0, v[6:7] offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5699,41 +3369,19 @@ define void @memmove_p3_p5_sz32_align_2_2(ptr addrspace(3) align 2 %dst, 
ptr add ; CHECK-LABEL: memmove_p3_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v17, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: v_lshl_or_b32 v1, v4, 16, v3 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: v_lshl_or_b32 v2, v5, 16, v2 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: v_lshl_or_b32 v3, v7, 16, v6 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: v_lshl_or_b32 v4, v9, 16, v8 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: v_lshl_or_b32 v5, v11, 16, v10 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:16 +; 
CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: v_lshl_or_b32 v6, v13, 16, v12 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: v_lshl_or_b32 v8, v15, 16, v14 +; CHECK-NEXT: ds_write2_b64 v0, v[2:3], v[4:5] offset1:1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_lshl_or_b32 v7, v17, 16, v16 -; CHECK-NEXT: ds_write2_b64 v0, v[1:2], v[3:4] offset1:1 -; CHECK-NEXT: ds_write2_b64 v0, v[5:6], v[7:8] offset0:2 offset1:3 +; CHECK-NEXT: ds_write2_b64 v0, v[6:7], v[8:9] offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -5890,55 +3538,12 @@ define void @memmove_p5_p0_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) 
lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail 
call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 16, i1 false) @@ -5949,100 +3554,24 @@ define void @memmove_p5_p0_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:1 -; CHECK-NEXT: 
flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt 
vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 
0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6053,103 +3582,19 @@ define void @memmove_p5_p0_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:31 -; CHECK-NEXT: flat_load_ubyte v4, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ubyte v5, v[1:2] offset:29 -; CHECK-NEXT: flat_load_ubyte v6, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ubyte v7, v[1:2] offset:27 -; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ubyte v9, v[1:2] offset:25 -; CHECK-NEXT: flat_load_ubyte v10, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ubyte v11, v[1:2] offset:23 -; CHECK-NEXT: flat_load_ubyte v12, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ubyte v13, v[1:2] offset:21 -; CHECK-NEXT: flat_load_ubyte v14, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ubyte v15, v[1:2] offset:19 -; CHECK-NEXT: flat_load_ubyte v16, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ubyte v17, v[1:2] offset:17 -; CHECK-NEXT: flat_load_ubyte v18, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ubyte v19, v[1:2] offset:15 -; CHECK-NEXT: flat_load_ubyte v20, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ubyte v21, v[1:2] offset:13 -; CHECK-NEXT: flat_load_ubyte v22, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ubyte v23, v[1:2] offset:11 -; CHECK-NEXT: flat_load_ubyte v24, v[1:2] offset:10 -; 
CHECK-NEXT: flat_load_ubyte v25, v[1:2] offset:9 -; CHECK-NEXT: flat_load_ubyte v26, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ubyte v27, v[1:2] offset:7 -; CHECK-NEXT: flat_load_ubyte v28, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ubyte v29, v[1:2] offset:5 -; CHECK-NEXT: flat_load_ubyte v30, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ubyte v31, v[1:2] offset:3 -; CHECK-NEXT: flat_load_ubyte v32, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ubyte v33, v[1:2] offset:1 -; CHECK-NEXT: flat_load_ubyte v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(31) lgkmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) lgkmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) lgkmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) lgkmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) lgkmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) lgkmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) lgkmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) lgkmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) lgkmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) lgkmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) lgkmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) lgkmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; 
CHECK-NEXT: s_waitcnt vmcnt(18) lgkmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) lgkmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], 
v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(0) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6160,31 +3605,12 @@ define void @memmove_p5_p0_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 
offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6195,55 +3621,24 @@ define void @memmove_p5_p0_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ubyte v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort 
v15, v[1:2] offset:6 -; CHECK-NEXT: flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: flat_load_ubyte v8, v[1:2] offset:30 +; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:28 +; CHECK-NEXT: flat_load_dwordx3 v[5:7], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[1:4], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(2) 
lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6254,55 +3649,19 @@ define void @memmove_p5_p0_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p0_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: flat_load_ushort v3, v[1:2] offset:30 -; CHECK-NEXT: flat_load_ushort v4, v[1:2] offset:28 -; CHECK-NEXT: flat_load_ushort v5, v[1:2] offset:26 -; CHECK-NEXT: flat_load_ushort v6, v[1:2] offset:24 -; CHECK-NEXT: flat_load_ushort v7, v[1:2] offset:22 -; CHECK-NEXT: flat_load_ushort v8, v[1:2] offset:20 -; CHECK-NEXT: flat_load_ushort v9, v[1:2] offset:18 -; CHECK-NEXT: flat_load_ushort v10, v[1:2] offset:16 -; CHECK-NEXT: flat_load_ushort v11, v[1:2] offset:14 -; CHECK-NEXT: flat_load_ushort v12, v[1:2] offset:12 -; CHECK-NEXT: flat_load_ushort v13, v[1:2] offset:10 -; CHECK-NEXT: flat_load_ushort v14, v[1:2] offset:8 -; CHECK-NEXT: flat_load_ushort v15, v[1:2] offset:6 -; CHECK-NEXT: 
flat_load_ushort v16, v[1:2] offset:4 -; CHECK-NEXT: flat_load_ushort v17, v[1:2] offset:2 -; CHECK-NEXT: flat_load_ushort v1, v[1:2] -; CHECK-NEXT: s_waitcnt vmcnt(15) lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: flat_load_dwordx4 v[3:6], v[1:2] offset:16 +; CHECK-NEXT: flat_load_dwordx4 v[7:10], v[1:2] ; CHECK-NEXT: s_waitcnt vmcnt(1) lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: 
buffer_store_dword v6, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p0.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(0) noundef nonnull align 2 %src, i64 32, i1 false) @@ -6452,55 +3811,12 @@ define void @memmove_p5_p1_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, 
v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void 
@llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 16, i1 false) @@ -6511,100 +3827,24 @@ define void @memmove_p5_p1_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off 
offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: 
s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; 
CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 31, i1 false) @@ -6615,103 +3855,19 @@ define void @memmove_p5_p1_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: 
global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; 
CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 
offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(1) noundef nonnull align 1 %src, i64 32, i1 false) @@ -6722,31 +3878,12 @@ define void @memmove_p5_p1_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: 
s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 16, i1 false) @@ -6757,55 +3894,24 @@ define void @memmove_p5_p1_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, 
v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 
0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 31, i1 false) @@ -6816,55 +3922,19 @@ define void @memmove_p5_p1_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p1_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 
offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, 
s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p1.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(1) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7009,54 +4079,12 @@ define void @memmove_p5_p3_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:7 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; 
CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7067,72 +4095,25 @@ define void @memmove_p5_p3_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, 
v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v25, v1 -; CHECK-NEXT: ds_read_u8 v26, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:5 -; CHECK-NEXT: ds_read_u8 v31, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:18 
-; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 +; CHECK-NEXT: s_waitcnt lgkmcnt(4) +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 +; CHECK-NEXT: s_waitcnt lgkmcnt(3) +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_waitcnt lgkmcnt(2) +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:30 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: 
buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7143,74 +4124,18 @@ define void @memmove_p5_p3_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:24 -; CHECK-NEXT: ds_read_u8 v3, v1 offset:25 -; CHECK-NEXT: ds_read_u8 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u8 v5, v1 offset:27 -; CHECK-NEXT: ds_read_u8 v6, v1 offset:28 -; CHECK-NEXT: ds_read_u8 v7, v1 offset:29 -; CHECK-NEXT: ds_read_u8 v8, v1 offset:30 -; CHECK-NEXT: ds_read_u8 v9, v1 offset:31 -; CHECK-NEXT: ds_read_u8 v10, v1 offset:16 -; CHECK-NEXT: ds_read_u8 v11, v1 offset:17 -; CHECK-NEXT: ds_read_u8 v12, v1 offset:18 -; CHECK-NEXT: ds_read_u8 v13, v1 offset:19 -; CHECK-NEXT: ds_read_u8 v14, v1 offset:20 -; CHECK-NEXT: ds_read_u8 v15, v1 offset:21 -; CHECK-NEXT: ds_read_u8 v16, v1 offset:22 -; CHECK-NEXT: ds_read_u8 v17, v1 offset:23 -; CHECK-NEXT: ds_read_u8 v18, v1 offset:8 -; CHECK-NEXT: ds_read_u8 v19, v1 offset:9 -; CHECK-NEXT: ds_read_u8 v20, v1 offset:10 -; CHECK-NEXT: ds_read_u8 v21, v1 offset:11 -; CHECK-NEXT: ds_read_u8 v22, v1 offset:12 -; CHECK-NEXT: ds_read_u8 v23, v1 offset:13 -; CHECK-NEXT: ds_read_u8 v24, v1 offset:14 -; CHECK-NEXT: ds_read_u8 v25, v1 offset:15 -; CHECK-NEXT: ds_read_u8 v26, v1 -; CHECK-NEXT: ds_read_u8 v27, v1 offset:1 -; CHECK-NEXT: ds_read_u8 v28, v1 offset:2 -; CHECK-NEXT: ds_read_u8 v29, v1 offset:3 -; CHECK-NEXT: ds_read_u8 v30, v1 offset:4 -; CHECK-NEXT: ds_read_u8 v31, 
v1 offset:5 -; CHECK-NEXT: ds_read_u8 v32, v1 offset:6 -; CHECK-NEXT: ds_read_u8 v1, v1 offset:7 -; CHECK-NEXT: s_waitcnt lgkmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:6 
-; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(3) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7221,30 +4146,12 @@ define void @memmove_p5_p3_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v1, v1 offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; 
CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: ds_read2_b64 v[1:4], v1 offset1:1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7255,54 +4162,25 @@ define void @memmove_p5_p3_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u8 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; 
CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: ds_read_b32 v8, v1 offset:24 +; CHECK-NEXT: ds_read_u16 v9, v1 offset:28 +; CHECK-NEXT: ds_read_u8 v10, v1 offset:30 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read_b64 v[6:7], v1 offset:16 ; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 
offen offset:30 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7313,54 +4191,18 @@ define void @memmove_p5_p3_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p3_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: ds_read_u16 v2, v1 offset:30 -; CHECK-NEXT: ds_read_u16 v3, v1 offset:28 -; CHECK-NEXT: ds_read_u16 v4, v1 offset:26 -; CHECK-NEXT: ds_read_u16 v5, v1 offset:24 -; CHECK-NEXT: ds_read_u16 v6, v1 offset:22 -; CHECK-NEXT: ds_read_u16 v7, v1 offset:20 -; CHECK-NEXT: ds_read_u16 v8, v1 offset:18 -; CHECK-NEXT: ds_read_u16 v9, v1 offset:16 -; CHECK-NEXT: ds_read_u16 v10, v1 offset:14 -; CHECK-NEXT: ds_read_u16 v11, v1 offset:12 -; CHECK-NEXT: ds_read_u16 v12, v1 offset:10 -; CHECK-NEXT: ds_read_u16 v13, v1 offset:8 -; CHECK-NEXT: ds_read_u16 v14, v1 offset:6 -; CHECK-NEXT: ds_read_u16 v15, v1 offset:4 -; CHECK-NEXT: ds_read_u16 v16, v1 offset:2 -; CHECK-NEXT: ds_read_u16 v1, v1 -; CHECK-NEXT: s_waitcnt lgkmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt lgkmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt lgkmcnt(13) -; CHECK-NEXT: 
buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt lgkmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt lgkmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt lgkmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt lgkmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt lgkmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt lgkmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt lgkmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt lgkmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt lgkmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt lgkmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt lgkmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: ds_read2_b64 v[2:5], v1 offset1:1 +; CHECK-NEXT: ds_read2_b64 v[6:9], v1 offset0:2 offset1:3 ; CHECK-NEXT: s_waitcnt lgkmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: 
buffer_store_dword v8, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p3.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(3) noundef nonnull align 2 %src, i64 32, i1 false) @@ -7505,55 +4347,12 @@ define void @memmove_p5_p4_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: 
buffer_store_byte v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 16, i1 false) @@ -7564,100 +4363,24 @@ define void @memmove_p5_p4_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:28 -; 
CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:1 -; CHECK-NEXT: global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; 
CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 
offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 31, i1 false) @@ -7668,103 +4391,19 @@ define void @memmove_p5_p4_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: 
memmove_p5_p4_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:31 -; CHECK-NEXT: global_load_ubyte v4, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ubyte v5, v[1:2], off offset:29 -; CHECK-NEXT: global_load_ubyte v6, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ubyte v7, v[1:2], off offset:27 -; CHECK-NEXT: global_load_ubyte v8, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:25 -; CHECK-NEXT: global_load_ubyte v10, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ubyte v11, v[1:2], off offset:23 -; CHECK-NEXT: global_load_ubyte v12, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ubyte v13, v[1:2], off offset:21 -; CHECK-NEXT: global_load_ubyte v14, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ubyte v15, v[1:2], off offset:19 -; CHECK-NEXT: global_load_ubyte v16, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ubyte v17, v[1:2], off offset:17 -; CHECK-NEXT: global_load_ubyte v18, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ubyte v19, v[1:2], off offset:15 -; CHECK-NEXT: global_load_ubyte v20, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ubyte v21, v[1:2], off offset:13 -; CHECK-NEXT: global_load_ubyte v22, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ubyte v23, v[1:2], off offset:11 -; CHECK-NEXT: global_load_ubyte v24, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ubyte v25, v[1:2], off offset:9 -; CHECK-NEXT: global_load_ubyte v26, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ubyte v27, v[1:2], off offset:7 -; CHECK-NEXT: global_load_ubyte v28, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ubyte v29, v[1:2], off offset:5 -; CHECK-NEXT: global_load_ubyte v30, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ubyte v31, v[1:2], off offset:3 -; CHECK-NEXT: global_load_ubyte v32, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ubyte v33, v[1:2], off offset:1 -; CHECK-NEXT: 
global_load_ubyte v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: 
s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:3 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v33, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 
s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(4) noundef nonnull align 1 %src, i64 32, i1 false) @@ -7775,31 +4414,12 @@ define void @memmove_p5_p4_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: global_load_ushort v3, v[1:2], off -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:2 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:4 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen 
offset:8 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 16, i1 false) @@ -7810,55 +4430,24 @@ define void @memmove_p5_p4_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ubyte v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short 
v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: global_load_dwordx3 v[5:7], v[1:2], off offset:16 +; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:28 +; CHECK-NEXT: global_load_ubyte v9, v[1:2], off offset:30 +; CHECK-NEXT: global_load_dwordx4 v[1:4], v[1:2], off ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr 
addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 31, i1 false) @@ -7869,55 +4458,19 @@ define void @memmove_p5_p4_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p4_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: global_load_ushort v3, v[1:2], off offset:30 -; CHECK-NEXT: global_load_ushort v4, v[1:2], off offset:28 -; CHECK-NEXT: global_load_ushort v5, v[1:2], off offset:26 -; CHECK-NEXT: global_load_ushort v6, v[1:2], off offset:24 -; CHECK-NEXT: global_load_ushort v7, v[1:2], off offset:22 -; CHECK-NEXT: global_load_ushort v8, v[1:2], off offset:20 -; CHECK-NEXT: global_load_ushort v9, v[1:2], off offset:18 -; CHECK-NEXT: global_load_ushort v10, v[1:2], off offset:16 -; CHECK-NEXT: global_load_ushort v11, v[1:2], off offset:14 -; CHECK-NEXT: global_load_ushort v12, v[1:2], off offset:12 -; CHECK-NEXT: global_load_ushort v13, v[1:2], off offset:10 -; CHECK-NEXT: global_load_ushort v14, v[1:2], off offset:8 -; CHECK-NEXT: global_load_ushort v15, v[1:2], off offset:6 -; CHECK-NEXT: global_load_ushort v16, v[1:2], off offset:4 -; CHECK-NEXT: global_load_ushort v17, v[1:2], off offset:2 -; CHECK-NEXT: global_load_ushort v1, v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: 
buffer_store_short v9, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x1 +; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v17, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p4.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(4) noundef nonnull align 2 %src, i64 32, i1 false) @@ -8062,55 +4615,19 @@ define void @memmove_p5_p5_sz16_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_1_1: ; CHECK: ; 
%bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:8 -; CHECK-NEXT: s_waitcnt 
vmcnt(7) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:7 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:6 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:5 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 16, i1 false) @@ -8121,100 +4638,34 @@ define void @memmove_p5_p5_sz31_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1e -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: 
buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt 
vmcnt(29) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt vmcnt(20) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v22, 
v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:9 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 
0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 31, i1 false) @@ -8225,103 +4676,31 @@ define void @memmove_p5_p5_sz32_align_1_1(ptr addrspace(5) align 1 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_1_1: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x1f -; CHECK-NEXT: buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:31 -; CHECK-NEXT: buffer_load_ubyte v3, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ubyte v4, v1, s[0:3], 0 offen offset:29 -; CHECK-NEXT: buffer_load_ubyte v5, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ubyte v6, v1, s[0:3], 0 offen offset:27 -; CHECK-NEXT: buffer_load_ubyte v7, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ubyte v8, v1, s[0:3], 0 offen offset:25 -; CHECK-NEXT: buffer_load_ubyte v9, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ubyte v10, v1, s[0:3], 0 offen offset:23 -; CHECK-NEXT: buffer_load_ubyte v11, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ubyte v12, v1, s[0:3], 0 offen offset:21 -; CHECK-NEXT: buffer_load_ubyte v13, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ubyte v14, v1, s[0:3], 0 offen offset:19 -; CHECK-NEXT: buffer_load_ubyte v15, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ubyte v16, v1, s[0:3], 0 offen offset:17 -; CHECK-NEXT: buffer_load_ubyte v17, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ubyte v18, v1, s[0:3], 0 offen offset:15 -; CHECK-NEXT: buffer_load_ubyte v19, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ubyte v20, v1, s[0:3], 0 offen offset:13 -; CHECK-NEXT: 
buffer_load_ubyte v21, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ubyte v22, v1, s[0:3], 0 offen offset:11 -; CHECK-NEXT: buffer_load_ubyte v23, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ubyte v24, v1, s[0:3], 0 offen offset:9 -; CHECK-NEXT: buffer_load_ubyte v25, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ubyte v26, v1, s[0:3], 0 offen offset:7 -; CHECK-NEXT: buffer_load_ubyte v27, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ubyte v28, v1, s[0:3], 0 offen offset:5 -; CHECK-NEXT: buffer_load_ubyte v29, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ubyte v30, v1, s[0:3], 0 offen offset:3 -; CHECK-NEXT: buffer_load_ubyte v31, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ubyte v32, v1, s[0:3], 0 offen offset:1 -; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(31) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:31 -; CHECK-NEXT: s_waitcnt vmcnt(30) -; CHECK-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(29) -; CHECK-NEXT: buffer_store_byte v4, v0, s[0:3], 0 offen offset:29 -; CHECK-NEXT: s_waitcnt vmcnt(28) -; CHECK-NEXT: buffer_store_byte v5, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(27) -; CHECK-NEXT: buffer_store_byte v6, v0, s[0:3], 0 offen offset:27 -; CHECK-NEXT: s_waitcnt vmcnt(26) -; CHECK-NEXT: buffer_store_byte v7, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(25) -; CHECK-NEXT: buffer_store_byte v8, v0, s[0:3], 0 offen offset:25 -; CHECK-NEXT: s_waitcnt vmcnt(24) -; CHECK-NEXT: buffer_store_byte v9, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(23) -; CHECK-NEXT: buffer_store_byte v10, v0, s[0:3], 0 offen offset:23 -; CHECK-NEXT: s_waitcnt vmcnt(22) -; CHECK-NEXT: buffer_store_byte v11, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(21) -; CHECK-NEXT: buffer_store_byte v12, v0, s[0:3], 0 offen offset:21 -; CHECK-NEXT: s_waitcnt 
vmcnt(20) -; CHECK-NEXT: buffer_store_byte v13, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(19) -; CHECK-NEXT: buffer_store_byte v14, v0, s[0:3], 0 offen offset:19 -; CHECK-NEXT: s_waitcnt vmcnt(18) -; CHECK-NEXT: buffer_store_byte v15, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(17) -; CHECK-NEXT: buffer_store_byte v16, v0, s[0:3], 0 offen offset:17 -; CHECK-NEXT: s_waitcnt vmcnt(16) -; CHECK-NEXT: buffer_store_byte v17, v0, s[0:3], 0 offen offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v18, v0, s[0:3], 0 offen offset:15 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_byte v19, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_byte v20, v0, s[0:3], 0 offen offset:13 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_byte v21, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_byte v22, v0, s[0:3], 0 offen offset:11 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_byte v23, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_byte v24, v0, s[0:3], 0 offen offset:9 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_byte v25, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_byte v26, v0, s[0:3], 0 offen offset:7 +; CHECK-NEXT: buffer_store_dword v2, 
v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_byte v27, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_byte v28, v0, s[0:3], 0 offen offset:5 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_byte v29, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_byte v30, v0, s[0:3], 0 offen offset:3 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_byte v31, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_byte v32, v0, s[0:3], 0 offen offset:1 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 1 %dst, ptr addrspace(5) noundef nonnull align 1 %src, i64 32, i1 false) @@ -8332,31 +4711,19 @@ define void @memmove_p5_p5_sz16_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz16_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0x7 -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: 
buffer_load_ushort v7, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:14 -; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:12 -; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:10 -; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: s_clause 0x3 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 16, i1 false) @@ -8367,55 +4734,34 @@ define void @memmove_p5_p5_sz31_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz31_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: 
buffer_load_ubyte v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) -; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 +; CHECK-NEXT: s_clause 0x8 +; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 
offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v9, v1, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v9, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen offset:30 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr 
addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 31, i1 false) @@ -8426,55 +4772,31 @@ define void @memmove_p5_p5_sz32_align_2_2(ptr addrspace(5) align 2 %dst, ptr add ; CHECK-LABEL: memmove_p5_p5_sz32_align_2_2: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_clause 0xf -; CHECK-NEXT: buffer_load_ushort v2, v1, s[0:3], 0 offen offset:30 -; CHECK-NEXT: buffer_load_ushort v3, v1, s[0:3], 0 offen offset:28 -; CHECK-NEXT: buffer_load_ushort v4, v1, s[0:3], 0 offen offset:26 -; CHECK-NEXT: buffer_load_ushort v5, v1, s[0:3], 0 offen offset:24 -; CHECK-NEXT: buffer_load_ushort v6, v1, s[0:3], 0 offen offset:22 -; CHECK-NEXT: buffer_load_ushort v7, v1, s[0:3], 0 offen offset:20 -; CHECK-NEXT: buffer_load_ushort v8, v1, s[0:3], 0 offen offset:18 -; CHECK-NEXT: buffer_load_ushort v9, v1, s[0:3], 0 offen offset:16 -; CHECK-NEXT: buffer_load_ushort v10, v1, s[0:3], 0 offen offset:14 -; CHECK-NEXT: buffer_load_ushort v11, v1, s[0:3], 0 offen offset:12 -; CHECK-NEXT: buffer_load_ushort v12, v1, s[0:3], 0 offen offset:10 -; CHECK-NEXT: buffer_load_ushort v13, v1, s[0:3], 0 offen offset:8 -; CHECK-NEXT: buffer_load_ushort v14, v1, s[0:3], 0 offen offset:6 -; CHECK-NEXT: buffer_load_ushort v15, v1, s[0:3], 0 offen offset:4 -; CHECK-NEXT: buffer_load_ushort v16, v1, s[0:3], 0 offen offset:2 -; CHECK-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen -; CHECK-NEXT: s_waitcnt vmcnt(15) -; CHECK-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen offset:30 -; CHECK-NEXT: s_waitcnt vmcnt(14) -; CHECK-NEXT: buffer_store_short v3, v0, s[0:3], 0 offen offset:28 -; CHECK-NEXT: s_waitcnt vmcnt(13) -; CHECK-NEXT: buffer_store_short v4, v0, s[0:3], 0 offen offset:26 -; CHECK-NEXT: s_waitcnt vmcnt(12) -; CHECK-NEXT: buffer_store_short v5, v0, s[0:3], 0 offen offset:24 -; CHECK-NEXT: s_waitcnt vmcnt(11) -; CHECK-NEXT: buffer_store_short v6, v0, s[0:3], 0 offen offset:22 -; CHECK-NEXT: s_waitcnt vmcnt(10) 
-; CHECK-NEXT: buffer_store_short v7, v0, s[0:3], 0 offen offset:20 -; CHECK-NEXT: s_waitcnt vmcnt(9) -; CHECK-NEXT: buffer_store_short v8, v0, s[0:3], 0 offen offset:18 -; CHECK-NEXT: s_waitcnt vmcnt(8) -; CHECK-NEXT: buffer_store_short v9, v0, s[0:3], 0 offen offset:16 +; CHECK-NEXT: s_clause 0x7 +; CHECK-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen offset:24 +; CHECK-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:28 +; CHECK-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:16 +; CHECK-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:20 +; CHECK-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_load_dword v7, v1, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_load_dword v8, v1, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(7) -; CHECK-NEXT: buffer_store_short v10, v0, s[0:3], 0 offen offset:14 +; CHECK-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen offset:24 ; CHECK-NEXT: s_waitcnt vmcnt(6) -; CHECK-NEXT: buffer_store_short v11, v0, s[0:3], 0 offen offset:12 +; CHECK-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:28 ; CHECK-NEXT: s_waitcnt vmcnt(5) -; CHECK-NEXT: buffer_store_short v12, v0, s[0:3], 0 offen offset:10 +; CHECK-NEXT: buffer_store_dword v4, v0, s[0:3], 0 offen offset:16 ; CHECK-NEXT: s_waitcnt vmcnt(4) -; CHECK-NEXT: buffer_store_short v13, v0, s[0:3], 0 offen offset:8 +; CHECK-NEXT: buffer_store_dword v5, v0, s[0:3], 0 offen offset:20 ; CHECK-NEXT: s_waitcnt vmcnt(3) -; CHECK-NEXT: buffer_store_short v14, v0, s[0:3], 0 offen offset:6 +; CHECK-NEXT: buffer_store_dword v6, v0, s[0:3], 0 offen offset:8 ; CHECK-NEXT: s_waitcnt vmcnt(2) -; CHECK-NEXT: buffer_store_short v15, v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dword v7, v0, s[0:3], 0 offen offset:12 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_short v16, v0, s[0:3], 0 offen offset:2 +; CHECK-NEXT: buffer_store_dword v8, v0, s[0:3], 0 offen 
; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; CHECK-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen offset:4 ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: tail call void @llvm.memmove.p5.p5.i64(ptr addrspace(5) noundef nonnull align 2 %dst, ptr addrspace(5) noundef nonnull align 2 %src, i64 32, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll index 94bc6d46b239..8ad6a4e534d2 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-commute.ll @@ -19,8 +19,8 @@ define void @extracted_values(ptr %ret_struct, ptr addrspace(3) %arg0, ptr addrs ; CHECK-NEXT: v_sub_f16_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; CHECK-NEXT: v_add_f16_e32 v4, v6, v7 ; CHECK-NEXT: v_add_f16_e32 v2, v3, v2 -; CHECK-NEXT: flat_store_short v[0:1], v4 -; CHECK-NEXT: flat_store_short v[0:1], v2 offset:2 +; CHECK-NEXT: v_pack_b32_f16 v2, v4, v2 +; CHECK-NEXT: flat_store_dword v[0:1], v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index 4e734d6e0884..fc33a274d7b1 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -607,7 +607,14 @@ define amdgpu_kernel void @local_store_align1_v16i8(ptr addrspace(3) %out) #0 { ; MUBUF: buffer_load_ubyte ; MUBUF: buffer_load_ubyte ; MUBUF: buffer_load_ubyte -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte +; FLATSCR: scratch_load_ubyte define double @private_load_align1_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 1 ret double %x @@ -622,7 +629,14 @@ 
define double @private_load_align1_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte ; MUBUF: buffer_store_byte -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte +; FLATSCR: scratch_store_byte define void @private_store_align1_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 1 ret void @@ -651,7 +665,10 @@ define void @private_store_align4_f64(ptr addrspace(5) %out, double %x) #0 { ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort ; MUBUF: buffer_load_ushort -; FLATSCR: scratch_load_dwordx2 +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort +; FLATSCR: scratch_load_ushort define double @private_load_align2_f64(ptr addrspace(5) %in) { %x = load double, ptr addrspace(5) %in, align 2 ret double %x @@ -662,7 +679,10 @@ define double @private_load_align2_f64(ptr addrspace(5) %in) { ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short ; MUBUF: buffer_store_short -; FLATSCR: scratch_store_dwordx2 +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short +; FLATSCR: scratch_store_short define void @private_store_align2_f64(ptr addrspace(5) %out, double %x) #0 { store double %x, ptr addrspace(5) %out, align 2 ret void -- GitLab From 65780f4d8e34461e6bd3baf2ff77496f97874b94 Mon Sep 17 00:00:00 2001 From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com> Date: Fri, 11 Oct 2024 08:23:35 +0100 Subject: [PATCH 002/345] [C++20][Modules] Allow import for a header unit after #pragma (#111662) Summary: `#pragma` and headers that finish with them shouldn't prevent `import "header_unit.h"` syntax. 
Test Plan: check-clang --- clang/lib/Lex/Preprocessor.cpp | 4 ++++ .../import_header_unit_after_pragma.cpp | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 clang/test/Headers/import_header_unit_after_pragma.cpp diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index f0b4593e0cc2..ecc5166d7b81 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -902,6 +902,10 @@ void Preprocessor::Lex(Token &Result) { case tok::r_brace: StdCXXImportSeqState.handleCloseBrace(); break; +#define PRAGMA_ANNOTATION(X) case tok::annot_##X: +// For `#pragma ...` mimic ';'. +#include "clang/Basic/TokenKinds.def" +#undef PRAGMA_ANNOTATION // This token is injected to represent the translation of '#include "a.h"' // into "import a.h;". Mimic the notional ';'. case tok::annot_module_include: diff --git a/clang/test/Headers/import_header_unit_after_pragma.cpp b/clang/test/Headers/import_header_unit_after_pragma.cpp new file mode 100644 index 000000000000..b1ad3b07fea2 --- /dev/null +++ b/clang/test/Headers/import_header_unit_after_pragma.cpp @@ -0,0 +1,18 @@ +// RUN: rm -fR %t +// RUN: split-file %s %t +// RUN: cd %t +// RUN: %clang_cc1 -verify -std=c++20 -emit-header-unit -xc++-user-header bz0.h +// RUN: %clang_cc1 -verify -std=c++20 -emit-header-unit -xc++-user-header -fmodule-file=bz0.pcm bz.cpp + +//--- compare +#pragma GCC visibility push(default) +#pragma GCC visibility pop + +//--- bz0.h +#include "compare" +// expected-no-diagnostics + +//--- bz.cpp +#include "compare" + +import "bz0.h"; // expected-warning {{the implementation of header units is in an experimental phase}} -- GitLab From ff04bb8f4064274aedcb6e916079132ab6042a10 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 09:31:49 +0200 Subject: [PATCH 003/345] [clang][bytecode] Use PredefinedExpr as base for its variable (#111956) This fixes the error message generated. 
--- clang/lib/AST/ByteCode/Compiler.cpp | 5 +++++ clang/lib/AST/ByteCode/Program.cpp | 15 +++++++++------ clang/lib/AST/ByteCode/Program.h | 3 ++- clang/test/AST/ByteCode/cxx1z.cpp | 4 ++++ 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 0a3b38b0dc6e..b2663714340b 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -2869,6 +2869,11 @@ bool Compiler::VisitPredefinedExpr(const PredefinedExpr *E) { if (DiscardResult) return true; + if (!Initializing) { + unsigned StringIndex = P.createGlobalString(E->getFunctionName(), E); + return this->emitGetPtrGlobal(StringIndex, E); + } + return this->delegate(E->getFunctionName()); } diff --git a/clang/lib/AST/ByteCode/Program.cpp b/clang/lib/AST/ByteCode/Program.cpp index 23245a66b578..cd2665f755d7 100644 --- a/clang/lib/AST/ByteCode/Program.cpp +++ b/clang/lib/AST/ByteCode/Program.cpp @@ -33,7 +33,7 @@ const void *Program::getNativePointer(unsigned Idx) { return NativePointers[Idx]; } -unsigned Program::createGlobalString(const StringLiteral *S) { +unsigned Program::createGlobalString(const StringLiteral *S, const Expr *Base) { const size_t CharWidth = S->getCharByteWidth(); const size_t BitWidth = CharWidth * Ctx.getCharBit(); @@ -52,12 +52,15 @@ unsigned Program::createGlobalString(const StringLiteral *S) { llvm_unreachable("unsupported character width"); } + if (!Base) + Base = S; + // Create a descriptor for the string. - Descriptor *Desc = - allocateDescriptor(S, CharType, Descriptor::GlobalMD, S->getLength() + 1, - /*isConst=*/true, - /*isTemporary=*/false, - /*isMutable=*/false); + Descriptor *Desc = allocateDescriptor(Base, CharType, Descriptor::GlobalMD, + S->getLength() + 1, + /*isConst=*/true, + /*isTemporary=*/false, + /*isMutable=*/false); // Allocate storage for the string. // The byte length does not include the null terminator. 
diff --git a/clang/lib/AST/ByteCode/Program.h b/clang/lib/AST/ByteCode/Program.h index be84c40714a6..f676672fb7ce 100644 --- a/clang/lib/AST/ByteCode/Program.h +++ b/clang/lib/AST/ByteCode/Program.h @@ -64,7 +64,8 @@ public: const void *getNativePointer(unsigned Idx); /// Emits a string literal among global data. - unsigned createGlobalString(const StringLiteral *S); + unsigned createGlobalString(const StringLiteral *S, + const Expr *Base = nullptr); /// Returns a pointer to a global. Pointer getPtrGlobal(unsigned Idx) const; diff --git a/clang/test/AST/ByteCode/cxx1z.cpp b/clang/test/AST/ByteCode/cxx1z.cpp index 1a06597fa348..57f99235a2b2 100644 --- a/clang/test/AST/ByteCode/cxx1z.cpp +++ b/clang/test/AST/ByteCode/cxx1z.cpp @@ -13,3 +13,7 @@ namespace Temp { char arr[3]; A d; // both-error {{refers to subobject '&arr[1]'}} + +void Func() { + A a; // both-error {{pointer to subobject of predefined '__func__' variable}} +} -- GitLab From bff2b8c06f362b6b4c761fc1d3951da2bddf17de Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Fri, 11 Oct 2024 15:56:39 +0800 Subject: [PATCH 004/345] [mlir][sparse][test] Adjust tests for `LowerSparseOpsToForeach` (#110976) This PR relocates the tests added in #109435 to a new file named `no_lowering.mlir` and adds some new tests. 
--- mlir/test/Dialect/SparseTensor/codegen.mlir | 16 ------ .../Dialect/SparseTensor/no_lowering.mlir | 54 +++++++++++++++++++ 2 files changed, 54 insertions(+), 16 deletions(-) create mode 100644 mlir/test/Dialect/SparseTensor/no_lowering.mlir diff --git a/mlir/test/Dialect/SparseTensor/codegen.mlir b/mlir/test/Dialect/SparseTensor/codegen.mlir index df03d871ba3a..af78458f1093 100644 --- a/mlir/test/Dialect/SparseTensor/codegen.mlir +++ b/mlir/test/Dialect/SparseTensor/codegen.mlir @@ -826,19 +826,3 @@ func.func @sparse_new_coo_permute_no(%arg0: !llvm.ptr) -> tensor return %0 : tensor } - -// CHECK-LABEL: func.func @test_tensor_dim_unranked -// CHECK: tensor.dim -func.func @test_tensor_dim_unranked(%arg0: tensor<*xf32>) -> index { - %c = arith.constant 0 : index - %0 = tensor.dim %arg0, %c : tensor<*xf32> - return %0 : index -} - -// CHECK-LABEL: func.func @test_tensor_reshape_unranked -// CHECK: tensor.reshape -func.func @test_tensor_reshape_unranked(%src: tensor<*xf32>, %shape: tensor<1xi32>) -> tensor { - %dst = tensor.reshape %src(%shape) - : (tensor<*xf32>, tensor<1xi32>) -> tensor - return %dst : tensor -} diff --git a/mlir/test/Dialect/SparseTensor/no_lowering.mlir b/mlir/test/Dialect/SparseTensor/no_lowering.mlir new file mode 100644 index 000000000000..4f21055a13d5 --- /dev/null +++ b/mlir/test/Dialect/SparseTensor/no_lowering.mlir @@ -0,0 +1,54 @@ +// RUN: mlir-opt %s --lower-sparse-ops-to-foreach --split-input-file | FileCheck %s + +// Ensure that we exit gracefully rather than crashing. 
+ +// CHECK-LABEL: func.func @test_tensor_dim_unranked +// CHECK: tensor.dim +func.func @test_tensor_dim_unranked(%arg0: tensor<*xf32>) -> index { + %c = arith.constant 0 : index + %0 = tensor.dim %arg0, %c : tensor<*xf32> + return %0 : index +} + +// ----- + +#SparseVector = #sparse_tensor.encoding<{ + map = (d0) -> (d0 : compressed) +}> + +// CHECK-LABEL: func.func @test_no_constant_dim +// CHECK: tensor.dim +func.func @test_no_constant_dim(%arg0: tensor, %arg1: index) -> index { + %0 = tensor.dim %arg0, %arg1 : tensor + return %0 : index +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_dim_no_encoding +// CHECK: tensor.dim +func.func @test_tensor_dim_no_encoding(%arg0: tensor) -> index { + %c = arith.constant 0 : index + %0 = tensor.dim %arg0, %c : tensor + return %0 : index +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_reshape_unranked +// CHECK: tensor.reshape +func.func @test_tensor_reshape_unranked(%src: tensor<*xf32>, %shape: tensor<1xi32>) -> tensor { + %dst = tensor.reshape %src(%shape) + : (tensor<*xf32>, tensor<1xi32>) -> tensor + return %dst : tensor +} + +// ----- + +// CHECK-LABEL: func.func @test_tensor_reshape_no_encoding +// CHECK: tensor.reshape +func.func @test_tensor_reshape_no_encoding(%src: tensor, %shape: tensor<1xi32>) -> tensor { + %dst = tensor.reshape %src(%shape) + : (tensor, tensor<1xi32>) -> tensor + return %dst : tensor +} -- GitLab From 8bb12ca28f7f195aa483fdb5921681ec373564ab Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Fri, 11 Oct 2024 11:17:09 +0300 Subject: [PATCH 005/345] [clang][NFC] Update `cxx_dr_status.html` --- clang/www/cxx_dr_status.html | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 1a67b6103cf4..6f3cc8247d2e 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -17128,11 +17128,11 @@ objects Undesired outcomes with const_cast Not resolved - + 2880 - open + 
accepted Accessibility check for destructor of incomplete class type - Not resolved + Unknown 2881 @@ -17260,7 +17260,7 @@ objects 2901 - review + tentatively ready Unclear semantics for near-match aliased access Not resolved @@ -17408,31 +17408,31 @@ objects 2923 - open + tentatively ready Note about infinite loops and execution steps Not resolved 2924 - open + review Undefined behavior during constant evaluation Not resolved - + 2925 - open + NAD Deleting a pointer to an incomplete enumeration type - Not resolved + Unknown 2926 - open + tentatively ready Lookup context for dependent qualified names Not resolved 2927 - open + review Unclear status of translation unit with module keyword Not resolved -- GitLab From bb4696ce3051be820de91c8c98b2649af1680236 Mon Sep 17 00:00:00 2001 From: Dmitriy Smirnov Date: Fri, 11 Oct 2024 09:39:19 +0100 Subject: [PATCH 006/345] [mlir][linalg] Fix for bias handling for Winograd (#110331) PR makes winograd.output_transform op a destination style op and fixes handing of a pre-existing data in its output argument (i.e. possibly pre-initialized with bias, which was discarded before). 
--------- Signed-off-by: Dmitriy Smirnov --- .../mlir/Dialect/Linalg/IR/LinalgOps.td | 3 +- .../Linalg/Transforms/WinogradConv2D.cpp | 114 +++++++++--------- .../transform-tile-and-winograd-rewrite.mlir | 51 ++++---- .../Linalg/transform-tile-winograd.mlir | 26 ++-- .../Linalg/winograd-conv2d-rewrite.mlir | 17 +-- 5 files changed, 106 insertions(+), 105 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td index 5b6a90f806be..e42fd5d2ce13 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -313,7 +313,7 @@ def Linalg_WinogradInputTransformOp : Linalg_Op<"winograd_input_transform", } def Linalg_WinogradOutputTransformOp : Linalg_Op<"winograd_output_transform", - [AllElementTypesMatch<["value", "output"]>, + [AllElementTypesMatch<["value", "output"]>, DestinationStyleOpInterface, DeclareOpInterfaceMethods scf::ValueVector { + auto context = builder.getContext(); Value tileHIter = ivs[0]; Value tileWIter = ivs[1]; Value NIter = ivs[2]; @@ -740,29 +741,41 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value, FIter, 2, 3, /*loopNorFIdx=*/4, /*loopCorFIdx=*/5, /*heightIdx=*/0, /*widthIdx=*/1); - TransformMapKeyTy key = {m, r}; - int64_t retRows = 1; - int64_t retCols = 1; - int64_t leftScalarFactor = 1; - int64_t rightScalarFactor = 1; + const TransformMapKeyTy key = {m, r}; + const TransformMatrix &AMatrix = AMatrices.at(key); + const TransformMatrix &ATMatrix = ATMatrices.at(key); + int64_t scalarFactor = (rightTransform ? AMatrix.scalarFactor : 1) * + (leftTransform ? ATMatrix.scalarFactor : 1); + int64_t retCols = rightTransform ? AMatrix.cols : 1; + int64_t retRows = leftTransform ? ATMatrix.rows : 1; + Value matmulRetValue = extractValue; Value zero = builder.create( loc, rewriter.getZeroAttr(elementType)); - if (leftTransform) { - // Get constant transform matrix AT. 
- auto it = ATMatrices.find(key); - if (it == ATMatrices.end()) - return {}; - const TransformMatrix &ATMatrix = it->second; - leftScalarFactor = ATMatrix.scalarFactor; - retRows = ATMatrix.rows; + auto affineMap = + AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); + Value heightOffset = + builder.create(loc, affineMap, tileHIter); + Value widthOffset = + builder.create(loc, affineMap, tileWIter); + + Value outInitVal = + extract2DDataFrom4D(builder, loc, args[0], NIter, FIter, heightOffset, + widthOffset, retRows, retCols, + /*loopNorFIdx=*/0, + /*loopCorFIdx=*/3, /*heightIdx=*/1, + /*widthIdx=*/2); + if (leftTransform) { auto matmulType = RankedTensorType::get({retRows, valueW}, elementType); - auto empty = - builder - .create(loc, matmulType.getShape(), elementType) - .getResult(); - auto init = builder.create(loc, zero, empty).getResult(0); + Value init = outInitVal; + if (rightTransform || scalarFactor != 1) { + auto empty = builder + .create(loc, matmulType.getShape(), + elementType) + .getResult(); + init = builder.create(loc, zero, empty).getResult(0); + } Value AT = create2DTransformMatrix(builder, loc, ATMatrix, elementType); // Multiply AT x m. @@ -772,21 +785,16 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value, } if (rightTransform) { - // Get constant transform matrix T. 
- auto it = AMatrices.find(key); - if (it == AMatrices.end()) - return {}; - const TransformMatrix &AMatrix = it->second; - - rightScalarFactor = AMatrix.scalarFactor; auto matmulType = RankedTensorType::get({retRows, AMatrix.cols}, elementType); - retCols = AMatrix.cols; - auto empty = - builder - .create(loc, matmulType.getShape(), elementType) - .getResult(); - auto init = builder.create(loc, zero, empty).getResult(0); + Value init = outInitVal; + if (scalarFactor != 1) { + auto empty = builder + .create(loc, matmulType.getShape(), + elementType) + .getResult(); + init = builder.create(loc, zero, empty).getResult(0); + } Value A = create2DTransformMatrix(builder, loc, AMatrix, elementType); // Multiply y = (AT x m) x A. @@ -795,48 +803,36 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value, matmulRetValue = matmulOp.getResult(0); } - if (leftScalarFactor * rightScalarFactor != 1) { - // Multiply scalar factor. - Value scalarFactor = builder.create( - loc, - FloatAttr::get(elementType, leftScalarFactor * rightScalarFactor)); + if (scalarFactor != 1) { + // Multiply by scalar factor and add outInitVal. 
+ Value scalarFactorValue = builder.create( + loc, FloatAttr::get(elementType, scalarFactor)); auto matmulType = RankedTensorType::get({retRows, retCols}, elementType); - auto init = builder.create(loc, matmulType.getShape(), - elementType); - auto identityAffineMap = rewriter.getMultiDimIdentityMap(2); SmallVector affineMaps = { - AffineMap::get(2, 0, init.getContext()), identityAffineMap}; - auto broadcastedScalar = + AffineMap::get(2, 0, context), identityAffineMap, identityAffineMap}; + + matmulRetValue = rewriter .create( - loc, matmulType, ValueRange{scalarFactor}, ValueRange{init}, - affineMaps, + loc, matmulType, + ValueRange{scalarFactorValue, matmulRetValue}, + ValueRange{outInitVal}, affineMaps, llvm::ArrayRef{ utils::IteratorType::parallel, utils::IteratorType::parallel}, [&](OpBuilder &nestedBuilder, Location nestedLoc, ValueRange args) { - nestedBuilder.create(nestedLoc, args[0]); + auto mulf = nestedBuilder.create( + nestedLoc, args[0], args[1]); + auto addf = nestedBuilder.create( + nestedLoc, mulf.getResult(), args[2]); + nestedBuilder.create(nestedLoc, + addf.getResult()); }) .getResult(0); - - matmulRetValue = builder - .create( - loc, matmulType, - ValueRange{broadcastedScalar, matmulRetValue}, - ValueRange{init}) - .getResult(0); } - auto context = builder.getContext(); - auto affineMap = - AffineMap::get(1, 0, {builder.getAffineDimExpr(0) * m}, context); - Value heightOffset = - builder.create(loc, affineMap, tileHIter); - Value widthOffset = - builder.create(loc, affineMap, tileWIter); - // Insert (H, W) to (N, H, W, F). 
Value combinedVal = insert2DDataTo4D(builder, loc, matmulRetValue, args[0], NIter, FIter, diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir index c5760acf94a8..776dc5b748c8 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir @@ -85,31 +85,32 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[S9]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] // CHECK: %[[COLLAPSED_6:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S7:.*]] = tensor.empty() // CHECK: %[[S6:.*]] = linalg.batch_matmul // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] -// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x8x8x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S7]]) +// CHECK: %[[S8:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] // CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) // CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG2]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_7:.*]] = tensor.extract_slice %[[ARG6]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] // CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) // CHECK: %[[S15:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] 
step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) // CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S25:.*]] = tensor.extract_slice %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: %[[S16:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK: %[[S17:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S16]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S18:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_8]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S19:.*]] = tensor.empty() : tensor<4x4xf32> // CHECK: %[[S20:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S19]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK: %[[S21:.*]] = linalg.matmul ins(%[[S18]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S21]] : f32, tensor<4x4xf32>) outs(%[[S25]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_90:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: %[[VAL_91:.*]] = arith.addf %[[VAL_90]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_91]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S24:.*]] = linalg.mul ins(%[[S23]], %[[S21]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice 
%[[S24]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S23]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_9]] // CHECK: scf.yield %[[S15]] // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) @@ -218,32 +219,33 @@ module attributes {transform.with_named_sequence} { // CHECK: scf.yield %[[S9]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] // CHECK: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S4]] {{\[}}[0, 1], [2, 3, 4], [5]] +// CHECK: %[[S7:.*]] = tensor.empty() // CHECK: %[[S6:.*]] = linalg.batch_matmul // CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] // CHECK: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] -// CHECK: %[[S7:.*]] = tensor.empty() : tensor<2x12x12x2xf32> -// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S7]]) +// CHECK: %[[S8:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[PADDED_8]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C3]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) // CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] // CHECK: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG6]]) -// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[PADDED_8]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] +// CHECK: %[[EXTRACTED_SLICE_10:.*]] = tensor.extract_slice %[[ARG7]][0, %[[S10]], %[[S11]], 0] [2, 4, 4, 2] [1, 1, 1, 1] // CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) // CHECK: %[[S15:.*]] = scf.for 
%[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) // CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S26:.*]] = tensor.extract_slice %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: %[[S17:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK: %[[S18:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S19:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_11]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S18]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S20:.*]] = tensor.empty() : tensor<4x4xf32> // CHECK: %[[S21:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK: %[[S22:.*]] = linalg.matmul ins(%[[S19]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S21]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[S23:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S24:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S23]] : tensor<4x4xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S24:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S22]] : f32, tensor<4x4xf32>) outs(%[[S26]] : tensor<4x4xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_104:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: %[[VAL_105:.*]] = arith.addf %[[VAL_104]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_105]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S25:.*]] = linalg.mul ins(%[[S24]], %[[S22]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S23]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: 
%[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S25]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S24]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_12]] // CHECK: scf.yield %[[S15]] : tensor<2x4x4x2xf32> // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) @@ -330,16 +332,17 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S15:.*]] = tensor.extract_slice %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] // CHECK: %[[S9:.*]] = tensor.empty() : tensor<4x1xf32> // CHECK: %[[S10:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S9]] : tensor<4x1xf32>) -> tensor<4x1xf32> // CHECK: %[[S11:.*]] = linalg.matmul ins(%[[CST_0]], %[[EXTRACTED_SLICE]] : tensor<4x6xf32>, tensor<6x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[S12:.*]] = tensor.empty() : tensor<4x1xf32> -// CHECK: %[[S13:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S12]] : tensor<4x1xf32>) { -// CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK: linalg.yield %[[IN]] : f32 +// CHECK: %[[S13:.*]] = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S11]] : f32, tensor<4x1xf32>) outs(%[[S15]] : tensor<4x1xf32>) { +// CHECK: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK: %[[VAL_57:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK: %[[VAL_58:.*]] = arith.addf 
%[[VAL_57]], %[[OUT]] : f32 +/// CHECK: linalg.yield %[[VAL_58]] : f32 // CHECK: } -> tensor<4x1xf32> -// CHECK: %[[S14:.*]] = linalg.mul ins(%[[S13]], %[[S11]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S12]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: return %[[S6]] diff --git a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir index 21522a2083b4..9598c434aadb 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-winograd.mlir @@ -279,14 +279,14 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C2_1:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[ARG1]]) -> (tensor<2x8x8x2xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C2_1]] step %[[C1_2]] iter_args(%[[ARG6:.*]] = %[[ARG5]]) -> (tensor<2x8x8x2xf32>) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], 0, 0] [6, 6, 1, 1, 2, 2] [1, 1, 1, 1, 1, 1] : tensor<6x6x2x2x2x2xf32> to tensor<6x6x1x1x2x2xf32> // CHECK: %[[S3:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S4:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S5:.*]] = affine.apply #[[$MAP1]]() // CHECK: %[[S6:.*]] = affine.apply #[[$MAP1]]() -// CHECK: 
%[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG1]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> +// CHECK: %[[EXTRACTED_SLICE_5:.*]] = tensor.extract_slice %[[ARG6]][0, %[[S3]], %[[S4]], 0] [2, %[[S5]], %[[S6]], 2] [1, 1, 1, 1] : tensor<2x8x8x2xf32> to tensor<2x?x?x2xf32> // ----- @@ -321,10 +321,10 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C2_3:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C2_5:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C2_7:.*]] = arith.constant 2 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C2_0]] iter_args(%[[ARG9:.*]] = %[[ARG1]]) -> (tensor<3x8x8x5xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_1]] to %[[C2_2]] step %[[C2_3]] iter_args(%[[ARG10:.*]] = %[[ARG9]]) -> (tensor<3x8x8x5xf32>) +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_4]] to %[[C3]] step %[[C2_5]] iter_args(%[[ARG11:.*]] = %[[ARG10]]) +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_6]] to %[[C5]] step %[[C2_7]] iter_args(%[[ARG12:.*]] = %[[ARG11]]) // CHECK: %[[C3_8:.*]] = arith.constant 3 : index // CHECK: %[[S5:.*]] = affine.min #[[$MAP0]](%[[ARG6]]) // CHECK: %[[C5_9:.*]] = arith.constant 5 : index @@ -334,7 +334,7 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S8:.*]] = affine.apply #[[$MAP2]](%[[ARG4]]) // CHECK: %[[S9:.*]] = affine.apply #[[$MAP3]]() // CHECK: %[[S10:.*]] = affine.apply #[[$MAP3]]() -// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], %[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : 
tensor<3x8x8x5xf32> to tensor +// CHECK: %[[EXTRACTED_SLICE_12:.*]] = tensor.extract_slice %[[ARG12]][%[[ARG6]], %[[S7]], %[[S8]], %[[ARG8]]] [%[[S5]], %[[S9]], %[[S10]], %[[S6]]] [1, 1, 1, 1] : tensor<3x8x8x5xf32> to tensor // ----- @@ -367,14 +367,14 @@ module attributes {transform.with_named_sequence} { // CHECK-DAG: %[[C1_2:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_4:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C1_6:.*]] = arith.constant 1 : index -// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] -// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] -// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] -// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] +// CHECK: %[[S1:.*]] = scf.for %[[ARG2:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[ARG1]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S2:.*]] = scf.for %[[ARG4:.*]] = %[[C0_0]] to %[[C1_1]] step %[[C1_2]] iter_args(%[[ARG10:.*]] = %[[ARG9]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S3:.*]] = scf.for %[[ARG6:.*]] = %[[C0_3]] to %[[C3]] step %[[C1_4]] iter_args(%[[ARG11:.*]] = %[[ARG10]]) -> (tensor<3x8x1x5xf32>) +// CHECK: %[[S4:.*]] = scf.for %[[ARG8:.*]] = %[[C0_5]] to %[[C5]] step %[[C1_6]] iter_args(%[[ARG12:.*]] = %[[ARG11]]) -> (tensor<3x8x1x5xf32>) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][0, 0, %[[ARG2]], %[[ARG4]], %[[ARG6]], %[[ARG8]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x1x2x1x3x5xf32> to tensor<6x1x1x1x1x1xf32> // CHECK: %[[S5:.*]] = affine.apply #[[$MAP0]](%[[ARG2]]) // CHECK: %[[S6:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) // CHECK: %[[S7:.*]] = affine.apply #[[$MAP1]]() // CHECK: %[[S8:.*]] = affine.apply #[[$MAP1]]() -// CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> +// CHECK: 
%[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG12]][%[[ARG6]], %[[S5]], 0, %[[ARG8]]] [1, %[[S7]], 1, 1] [1, 1, 1, 1] : tensor<3x8x1x5xf32> to tensor<1x?x1x1xf32> // CHECK: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXTRACTED_SLICE]] : tensor<6x1x1x1x1x1xf32>) outs(%[[EXTRACTED_SLICE_9]] : tensor<1x?x1x1xf32>) -> tensor<1x?x1x1xf32> diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir index 4369f5f1eab4..16d06a747327 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir @@ -100,21 +100,22 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S8:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[S9:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6xf32> +// CHECK-NEXT: %[[S20:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK-NEXT: %[[S21:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK-NEXT: %[[S22:.*]] = tensor.extract_slice %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<4x4xf32> // CHECK-NEXT: %[[S11:.*]] = tensor.empty() : tensor<4x6xf32> // CHECK-NEXT: %[[S12:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S11]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_9]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S12]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK-NEXT: %[[S14:.*]] = tensor.empty() : tensor<4x4xf32> // CHECK-NEXT: %[[S15:.*]] = 
linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<4x4xf32>) -> tensor<4x4xf32> // CHECK-NEXT: %[[S16:.*]] = linalg.matmul ins(%[[S13]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S15]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK-NEXT: %[[S17:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK-NEXT: %[[S18:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S17]] : tensor<4x4xf32>) { -// CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): -// CHECK-NEXT: linalg.yield %[[IN]] : f32 +// CHECK-NEXT: %[[S18:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]], %[[S16]] : f32, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) { +// CHECK-NEXT: ^bb0(%[[IN1:.*]]: f32, %[[IN2:.*]]: f32, %[[OUT:.*]]: f32): +// CHECK-NEXT: %[[VAL_98:.*]] = arith.mulf %[[IN1]], %[[IN2]] : f32 +// CHECK-NEXT: %[[VAL_99:.*]] = arith.addf %[[VAL_98]], %[[OUT]] : f32 +// CHECK-NEXT: linalg.yield %[[VAL_99]] : f32 // CHECK-NEXT: } -> tensor<4x4xf32> -// CHECK-NEXT: %[[S19:.*]] = linalg.mul ins(%[[S18]], %[[S16]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S17]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK-NEXT: %[[S20:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK-NEXT: %[[S21:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S19]] into %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S18]] into %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<2x12x12x2xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S9]] : tensor<2x12x12x2xf32> -- GitLab From ebeb56af5f8f1ff9da8f5a7e98348f460d223de1 Mon 
Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 11 Oct 2024 10:40:28 +0200 Subject: [PATCH 007/345] [lldb] Only send "posix" error codes through the gdb-remote protocol (#108170) The other side has no way of telling which namespace do these codes belong to, so mashing them all together is not very helpful. I'm mainly doing this to simplify some code in a pending patch , and I've picked the posix error category semi-randomly. If we wanted to be serious about assigning meaning to these error codes, we should create a special error category for "gdb errors". -- GitLab From b222f319306a9cad9ac11183b7036ff45097c26f Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Fri, 11 Oct 2024 12:56:42 +0400 Subject: [PATCH 008/345] [lldb][test] Fixed the test `no_unique_address-with-bitfields` (#111902) Fixed the error `unable to create target: 'No available targets are compatible with triple "x86_64-apple-macosx10.4.0"'` running `clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s`. --- .../DWARF/{ => x86}/no_unique_address-with-bitfields.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lldb/test/Shell/SymbolFile/DWARF/{ => x86}/no_unique_address-with-bitfields.cpp (100%) diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp similarity index 100% rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-with-bitfields.cpp rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-with-bitfields.cpp -- GitLab From 72f339de45bb590f25571c4c447a725e6f1dd8d7 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Fri, 11 Oct 2024 10:10:15 +0100 Subject: [PATCH 009/345] [LoopVectorize] Use predicated version of getSmallConstantMaxTripCount (#109928) There are a number of places where we call getSmallConstantMaxTripCount without passing a vector of predicates: getSmallBestKnownTC isIndvarOverflowCheckKnownFalse computeMaxVF isMoreProfitable I've changed 
all of these to now pass in a predicate vector so that we get the benefit of making better vectorisation choices when we know the max trip count for loops that require SCEV predicate checks. I've tried to add tests that cover all the cases affected by these changes. --- llvm/include/llvm/Analysis/ScalarEvolution.h | 7 + llvm/lib/Analysis/ScalarEvolution.cpp | 10 + .../Transforms/Vectorize/LoopVectorize.cpp | 48 ++- .../AArch64/low_trip_count_predicates.ll | 397 ++++++++++++++++++ .../RISCV/riscv-vector-reverse.ll | 2 + 5 files changed, 442 insertions(+), 22 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 179a2c38d9d3..328926f0b7aa 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -2376,6 +2376,10 @@ public: /// Get the (predicated) symbolic max backedge count for the analyzed loop. const SCEV *getSymbolicMaxBackedgeTakenCount(); + /// Returns the upper bound of the loop trip count as a normal unsigned + /// value, or 0 if the trip count is unknown. + unsigned getSmallConstantMaxTripCount(); + /// Adds a new predicate. void addPredicate(const SCEVPredicate &Pred); @@ -2447,6 +2451,9 @@ private: /// The symbolic backedge taken count. const SCEV *SymbolicMaxBackedgeCount = nullptr; + + /// The constant max trip count for the loop. 
+ std::optional SmallConstantMaxTripCount; }; template <> struct DenseMapInfo { diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 3d890f05c8ca..cea3a5bc865f 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -15050,6 +15050,16 @@ const SCEV *PredicatedScalarEvolution::getSymbolicMaxBackedgeTakenCount() { return SymbolicMaxBackedgeCount; } +unsigned PredicatedScalarEvolution::getSmallConstantMaxTripCount() { + if (!SmallConstantMaxTripCount) { + SmallVector Preds; + SmallConstantMaxTripCount = SE.getSmallConstantMaxTripCount(&L, &Preds); + for (const auto *P : Preds) + addPredicate(*P); + } + return *SmallConstantMaxTripCount; +} + void PredicatedScalarEvolution::addPredicate(const SCEVPredicate &Pred) { if (Preds->implies(&Pred)) return; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index f2bee2c67a23..05dc58a42249 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -411,10 +411,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) { /// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. /// 4) Returns std::nullopt if all of the above failed. static std::optional -getSmallBestKnownTC(ScalarEvolution &SE, Loop *L, +getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax = true) { // Check if exact trip count is known. - if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L)) + if (unsigned ExpectedTC = PSE.getSE()->getSmallConstantTripCount(L)) return ExpectedTC; // Check if there is an expected trip count available from profile data. @@ -426,7 +426,7 @@ getSmallBestKnownTC(ScalarEvolution &SE, Loop *L, return std::nullopt; // Check if upper bound estimate is known. 
- if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L)) + if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount()) return ExpectedTC; return std::nullopt; @@ -1789,12 +1789,15 @@ class GeneratedRTChecks { Loop *OuterLoop = nullptr; + PredicatedScalarEvolution &PSE; + public: - GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI, - TargetTransformInfo *TTI, const DataLayout &DL, - bool AddBranchWeights) - : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"), - MemCheckExp(SE, DL, "scev.check"), AddBranchWeights(AddBranchWeights) {} + GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT, + LoopInfo *LI, TargetTransformInfo *TTI, + const DataLayout &DL, bool AddBranchWeights) + : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"), + MemCheckExp(*PSE.getSE(), DL, "scev.check"), + AddBranchWeights(AddBranchWeights), PSE(PSE) {} /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can /// accurately estimate the cost of the runtime checks. The blocks are @@ -1941,7 +1944,7 @@ public: // Get the best known TC estimate. if (auto EstimatedTC = getSmallBestKnownTC( - *SE, OuterLoop, /* CanUseConstantMax = */ false)) + PSE, OuterLoop, /* CanUseConstantMax = */ false)) BestTripCount = *EstimatedTC; BestTripCount = std::max(BestTripCount, 1U); @@ -2272,8 +2275,7 @@ static bool isIndvarOverflowCheckKnownFalse( // We know the runtime overflow check is known false iff the (max) trip-count // is known and (max) trip-count + (VF * UF) does not overflow in the type of // the vector loop induction variable. 
- if (unsigned TC = - Cost->PSE.getSE()->getSmallConstantMaxTripCount(Cost->TheLoop)) { + if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) { uint64_t MaxVF = VF.getKnownMinValue(); if (VF.isScalable()) { std::optional MaxVScale = @@ -3962,8 +3964,10 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { } unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - unsigned MaxTC = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); + unsigned MaxTC = PSE.getSmallConstantMaxTripCount(); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); + if (TC != MaxTC) + LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n'); if (TC == 1) { reportVectorizationFailure("Single iteration (non) loop", "loop trip count is one, irrelevant for vectorization", @@ -4257,7 +4261,7 @@ bool LoopVectorizationPlanner::isMoreProfitable( InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); + unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount(); // Improve estimate for the vector width if it is scalable. 
unsigned EstimatedWidthA = A.Width.getKnownMinValue(); @@ -4852,7 +4856,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, if (!Legal->isSafeForAnyVectorWidth()) return 1; - auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop); + auto BestKnownTC = getSmallBestKnownTC(PSE, TheLoop); const bool HasReductions = !Legal->getReductionVars().empty(); // If we did not calculate the cost for VF (because the user selected the VF) @@ -9618,8 +9622,8 @@ static bool processLoopInVPlanNativePath( { bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getDataLayout(), AddBranchWeights); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), + AddBranchWeights); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, VF.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" @@ -9683,7 +9687,7 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) { static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, std::optional VScale, Loop *L, - ScalarEvolution &SE, + PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL) { InstructionCost CheckCost = Checks.getCost(); if (!CheckCost.isValid()) @@ -9768,7 +9772,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, // Skip vectorization if the expected trip count is less than the minimum // required trip count. - if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) { + if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) { if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC), VF.MinProfitableTripCount)) { LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected " @@ -9875,7 +9879,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. 
- auto ExpectedTC = getSmallBestKnownTC(*SE, L); + auto ExpectedTC = getSmallBestKnownTC(PSE, L); if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) { LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is worth vectorizing only if no scalar " @@ -9973,8 +9977,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { bool AddBranchWeights = hasBranchWeightMD(*L->getLoopLatch()->getTerminator()); - GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, - F->getDataLayout(), AddBranchWeights); + GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), + AddBranchWeights); if (LVP.hasPlanWithVF(VF.Width)) { // Select the interleave count. IC = CM.selectInterleaveCount(VF.Width, VF.Cost); @@ -9990,7 +9994,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, - *PSE.getSE(), SEL)) { + PSE, SEL)) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll new file mode 100644 index 000000000000..1ec384b05779 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll @@ -0,0 +1,397 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; REQUIRES: asserts +; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -mattr=+sve 2>%t | FileCheck %s +; RUN: cat %t | FileCheck %s --check-prefix=DEBUG + +target triple = "aarch64-unknown-linux-gnu" + +; DEBUG-LABEL: LV: Checking a loop in 'low_vf_ic_is_better' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 19 +; DEBUG: LV: IC is 1 +; DEBUG: LV: VF is vscale x 8 +; DEBUG: Main Loop VF:vscale x 8, Main Loop UF:1, Epilogue Loop 
VF:vscale x 4, Epilogue Loop UF:1 + +; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small' +; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred. +; DEBUG: LV: Not vectorizing: The trip count is below the minial threshold value.. + +; DEBUG-LABEL: LV: Checking a loop in 'too_many_runtime_checks' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 16 +; DEBUG: LV: Clamping the MaxVF to maximum power of two not exceeding the constant trip count: 16 +; DEBUG: LV: IC is 1 +; DEBUG: LV: VF is 16 +; DEBUG: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 32) +; DEBUG: LV: Too many memory checks needed. + +; DEBUG-LABEL: LV: Checking a loop in 'overflow_indvar_known_false' +; DEBUG: LV: Found trip count: 0 +; DEBUG: LV: Found maximum trip count: 1027 +; DEBUG: LV: can fold tail by masking. +; DEBUG: Executing best plan with VF=vscale x 16, UF=1 + +define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef %val) { +; CHECK-LABEL: define void @low_vf_ic_is_better( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 19 +; CHECK-NEXT: br i1 [[CMP7]], label %[[ITER_CHECK:.*]], label %[[WHILE_END:.*]] +; CHECK: [[ITER_CHECK]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = sub i64 20, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP5]] +; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 19, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP10]], [[TMP6]] +; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt i64 [[TMP8]], 4294967295 +; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP11]], [[TMP12]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP3]], [[TMP15]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP17]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr 
inbounds i8, ptr [[TMP21]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP22]], align 1 +; CHECK-NEXT: [[TMP23:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store [[TMP23]], ptr [[TMP22]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[TMP3]], [[N_VEC]] +; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP33]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP3]], [[TMP35]] +; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF3]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC4]] +; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP36]], 4 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector [[BROADCAST_SPLATINSERT8]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: 
[[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = add i64 [[TMP0]], [[INDEX6]] +; CHECK-NEXT: [[TMP38:%.*]] = add i64 [[OFFSET_IDX7]], 0 +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP38]] +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP39]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP40]], align 1 +; CHECK-NEXT: [[TMP41:%.*]] = add [[WIDE_LOAD7]], [[BROADCAST_SPLAT9]] +; CHECK-NEXT: store [[TMP41]], ptr [[TMP40]], align 1 +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX6]], [[TMP37]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP42]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[CMP_N12]], label %[[WHILE_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ], [ [[TMP0]], %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 19 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 19 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 19 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @trip_count_too_small(ptr nocapture noundef %p, i32 noundef %tc, i16 noundef %val) { +; CHECK-LABEL: define void @trip_count_too_small( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 3 +; CHECK-NEXT: br i1 [[CMP7]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], 
i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP43]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP44]], 3 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 3 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 3 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @too_many_runtime_checks(ptr nocapture noundef %p, ptr nocapture noundef %p1, ptr nocapture noundef readonly %p2, ptr nocapture noundef readonly %p3, i32 noundef %tc, i16 noundef %val) { +; CHECK-LABEL: define void @too_many_runtime_checks( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], ptr nocapture noundef [[P1:%.*]], ptr nocapture noundef readonly [[P2:%.*]], ptr nocapture noundef readonly [[P3:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP20:%.*]] = icmp ult i32 [[TC]], 16 +; CHECK-NEXT: br i1 [[CMP20]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV8:%.*]] = trunc i16 [[VAL]] to i8 +; 
CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP1]], %[[WHILE_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[P2]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP60:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P3]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP61:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP61]], [[TMP60]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P1]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP62:%.*]] = load i8, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[MUL]], [[TMP62]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP63:%.*]] = load i8, ptr [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[ADD12:%.*]] = add i8 [[TMP63]], [[CONV8]] +; CHECK-NEXT: store i8 [[ADD12]], ptr [[ARRAYIDX10]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[TMP64:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP64]], 16 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[WHILE_BODY]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp20 = icmp ult i32 %tc, 16 + br i1 %cmp20, label %while.preheader, label %while.end + +while.preheader: + %0 = trunc i16 %val to i8 + %v = getelementptr inbounds nuw i8, ptr %p, i64 4 + %1 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %1, 
%while.preheader ], [ %iv.next, %while.body ] + %arrayidx = getelementptr inbounds nuw i8, ptr %p2, i64 %iv + %2 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %p3, i64 %iv + %3 = load i8, ptr %arrayidx2, align 1 + %mul = mul i8 %3, %2 + %arrayidx5 = getelementptr inbounds nuw i8, ptr %p1, i64 %iv + %4 = load i8, ptr %arrayidx5, align 1 + %add = add i8 %mul, %4 + store i8 %add, ptr %arrayidx5, align 1 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %5 = load i8, ptr %arrayidx10, align 1 + %add12 = add i8 %5, %0 + store i8 %add12, ptr %arrayidx10, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %6 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %6, 16 + br i1 %exitcond.not, label %while.end, label %while.body + +while.end: + ret void +} + +define void @overflow_indvar_known_false(ptr nocapture noundef %p, i32 noundef %tc, i16 noundef %val) vscale_range(1,16) { +; CHECK-LABEL: define void @overflow_indvar_known_false( +; CHECK-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[TC:%.*]], i16 noundef [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP7:%.*]] = icmp ult i32 [[TC]], 1027 +; CHECK-NEXT: br i1 [[CMP7]], label %[[WHILE_PREHEADER:.*]], label %[[WHILE_END:.*]] +; CHECK: [[WHILE_PREHEADER]]: +; CHECK-NEXT: [[CONV:%.*]] = trunc i16 [[VAL]] to i8 +; CHECK-NEXT: [[V:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[TC]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 1028, [[TMP20]] +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TC]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = sub i64 1027, [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[TMP23]] to i32 +; 
CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP21]], [[TMP24]] +; CHECK-NEXT: [[TMP26:%.*]] = icmp ult i32 [[TMP25]], [[TMP21]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp ugt i64 [[TMP23]], 4294967295 +; CHECK-NEXT: [[TMP28:%.*]] = or i1 [[TMP26]], [[TMP27]] +; CHECK-NEXT: br i1 [[TMP28]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], [[TMP4]] +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP1]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[CONV]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[TMP0]], [[INDEX]] +; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP15:%.*]] = add [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; 
CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP15]], ptr [[TMP14]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 +; CHECK-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br i1 true, label %[[WHILE_END_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[TMP0]], %[[WHILE_PREHEADER]] ], [ [[TMP0]], %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[WHILE_BODY:.*]] +; CHECK: [[WHILE_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[WHILE_BODY]] ] +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i8, ptr [[V]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP18]], [[CONV]] +; CHECK-NEXT: store i8 [[ADD]], ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[TMP29:%.*]] = and i64 [[INDVARS_IV_NEXT]], 4294967295 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[TMP29]], 1027 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[WHILE_END_LOOPEXIT]], label %[[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[WHILE_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[WHILE_END]] +; CHECK: [[WHILE_END]]: +; CHECK-NEXT: ret void +; +entry: + %cmp7 = icmp ult i32 %tc, 1027 + br i1 %cmp7, label %while.preheader, label %while.end + +while.preheader: + %conv = trunc i16 %val to i8 + %v = getelementptr 
inbounds nuw i8, ptr %p, i64 4 + %0 = zext nneg i32 %tc to i64 + br label %while.body + +while.body: + %iv = phi i64 [ %0, %while.preheader ], [ %iv.next, %while.body ] + %iv.next = add nuw nsw i64 %iv, 1 + %arrayidx = getelementptr inbounds nuw i8, ptr %v, i64 %iv + %1 = load i8, ptr %arrayidx, align 1 + %add = add i8 %1, %conv + store i8 %add, ptr %arrayidx, align 1 + %2 = and i64 %iv.next, 4294967295 + %exitcond.not = icmp eq i64 %2, 1027 + br i1 %exitcond.not, label %while.end, label %while.body, !llvm.loop !0 + +while.end: + ret void +} + + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]} +; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index 1d5e6c117a2e..9a716f775607 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -20,6 +20,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)! ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Found trip count: 0 +; CHECK-NEXT: LV: Found maximum trip count: 4294967295 ; CHECK-NEXT: LV: Scalable vectorization is available ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. 
@@ -224,6 +225,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)! ; CHECK-NEXT: LV: Loop does not require scalar epilogue ; CHECK-NEXT: LV: Found trip count: 0 +; CHECK-NEXT: LV: Found maximum trip count: 4294967295 ; CHECK-NEXT: LV: Scalable vectorization is available ; CHECK-NEXT: LV: The max safe fixed VF is: 67108864. ; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295. -- GitLab From 1276ce9e9713b2a0802004676fad7e40980396d5 Mon Sep 17 00:00:00 2001 From: Emilio Cota Date: Fri, 11 Oct 2024 05:08:23 -0400 Subject: [PATCH 010/345] Revert "[mlir][linalg] Introduce transpose semantic to 'linalg.matmul' ops. (#104783)" This reverts commit 03483737a7a2d72a257a5ab6ff01748ad9cf0f75 and 99c8557, which is a fix-up on top of the former. I'm reverting because this commit broke two tests: mlir/test/python/integration/dialects/linalg/opsrun.py mlir/test/python/integration/dialects/transform.py See https://lab.llvm.org/buildbot/#/builders/138/builds/4872 I'm not familiar with the tests, so I'm leaving it to the original author to either remove or adapt the broken tests, as discussed here: https://github.com/llvm/llvm-project/pull/104783#issuecomment-2406390905 --- .../Dialect/Linalg/IR/LinalgInterfaces.td | 10 - .../Linalg/IR/LinalgNamedStructuredOps.yaml | 72 +++++ .../Dialect/Linalg/IR/LinalgStructuredOps.td | 134 --------- .../Dialect/Linalg/IR/LinalgInterfaces.cpp | 17 +- mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 263 +----------------- .../Linalg/Transforms/TransposeMatmul.cpp | 7 - .../Linalg/Transforms/Vectorization.cpp | 5 - .../NVGPU/TransformOps/NVGPUTransformOps.cpp | 6 - .../linalg/opdsl/ops/core_named_ops.py | 17 ++ .../Dialect/Linalg/generalize-named-ops.mlir | 111 -------- mlir/test/Dialect/Linalg/invalid.mlir | 159 ----------- mlir/test/Dialect/Linalg/named-ops.mlir | 243 ---------------- mlir/test/python/dialects/linalg/ops.py | 75 
+++++ .../mlir-linalg-ods-yaml-gen.cpp | 6 +- 14 files changed, 182 insertions(+), 943 deletions(-) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index e80dbb2afb9e..fbf3f19cde0e 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -684,16 +684,6 @@ def LinalgStructuredInterface return; }] >, - InterfaceMethod< - /*desc=*/[{ - Return true if the user has supplied an explicit indexing maps for this op. - }], - /*retTy=*/"bool", - /*methodName=*/"hasUserDefinedMaps", - /*args=*/(ins), - /*methodBody=*/"", - /*defaultImplementation=*/[{ return false; }] - >, //===------------------------------------------------------------------===// // Linalg generalization hooks. //===------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml index 97b90333e2b2..8cb698096ef5 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgNamedStructuredOps.yaml @@ -1065,6 +1065,78 @@ structured_op: !LinalgStructuredOpConfig - !ScalarExpression scalar_arg: rhs --- !LinalgOpConfig +metadata: !LinalgOpMetadata + name: matmul + cpp_class_name: MatmulOp + doc: |- + Performs a matrix multiplication of two 2D inputs. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. 
+ implements: + - LinalgContractionOpInterface +structured_op: !LinalgStructuredOpConfig + args: + - !LinalgOperandDefConfig + name: A + kind: input_tensor + type_var: T1 + shape_map: affine_map<()[s0, s1, s2] -> (s0, s1)> + - !LinalgOperandDefConfig + name: B + kind: input_tensor + type_var: T2 + shape_map: affine_map<()[s0, s1, s2] -> (s1, s2)> + - !LinalgOperandDefConfig + name: C + kind: output_tensor + type_var: U + shape_map: affine_map<()[s0, s1, s2] -> (s0, s2)> + - !LinalgOperandDefConfig + name: cast + kind: type_fn_attr + default_fn: cast_signed + indexing_maps: !LinalgIndexingMapsConfig + static_indexing_maps: + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d2)> + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2, d1)> + - affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0, d1)> + iterator_types: + - parallel + - parallel + - reduction + assignments: + - !ScalarAssign + arg: C + value: !ScalarExpression + scalar_fn: + kind: binary + fn_name: add + operands: + - !ScalarExpression + scalar_arg: C + - !ScalarExpression + scalar_fn: + kind: binary + fn_name: mul + operands: + - !ScalarExpression + scalar_fn: + kind: type + attr_name: cast + type_var: U + operands: + - !ScalarExpression + scalar_arg: A + - !ScalarExpression + scalar_fn: + kind: type + attr_name: cast + type_var: U + operands: + - !ScalarExpression + scalar_arg: B +--- !LinalgOpConfig metadata: !LinalgOpMetadata name: quantized_matmul cpp_class_name: QuantizedMatmulOp diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 61d4fc9734c6..31f291392472 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -535,140 +535,6 @@ def BroadcastOp : LinalgStructuredBase_Op<"broadcast", [ let hasCanonicalizer = 1; } -//===----------------------------------------------------------------------===// -// Op definition for MatmulOp 
-//===----------------------------------------------------------------------===// - -def MatmulOp : LinalgStructuredBase_Op<"matmul", [ - AttrSizedOperandSegments, - LinalgContractionOpInterface]> { - - let summary = [{ - Performs a matrix multiplication of two 2D inputs without broadcast or transpose. - }]; - let description = [{ - Numeric casting is performed on the operands to the inner multiply, - promoting them to the same data type as the accumulator/output. - - Broadcast and Transpose semantics can be appiled by specifying the explicit attribute - 'indexing_maps' as shown below.This is a list attribute, so the list must include all - the maps if specified. - - Example Transpose: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>,memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - ``` - - Example Broadcast: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, // broadcast - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - ``` - - Example Broadcast and transpose: - ``` - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, // transpose - affine_map<(d0, d1, d2) -> (d2)>, // broadcast - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - }]; - - let arguments = (ins - Variadic:$inputs, - Variadic:$outputs, - DefaultValuedOptionalAttr:$indexing_maps, - DefaultValuedOptionalAttr:$cast - ); - let results = (outs Variadic:$result_tensors); - let regions = (region AnyRegion:$region); - - let skipDefaultBuilders = 1; - let builders = [ - OpBuilder< - (ins "ValueRange":$inputs, "ValueRange":$outputs, - CArg<"ArrayRef", "{}">:$attributes), - [{ - buildStructuredOp($_builder, $_state, 
std::nullopt, inputs, outputs, - attributes, MatmulOp::getRegionBuilder()); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, - "ValueRange":$outputs, - CArg<"ArrayRef", "{}">:$attributes), - [{ - buildStructuredOp($_builder, $_state, resultTensorTypes, - inputs, outputs, attributes, MatmulOp::getRegionBuilder()); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands, - CArg<"ArrayRef", "{}">:$attributes), - [{ - $_state.addOperands(operands); - $_state.addAttributes(attributes); - $_state.addTypes(resultTensorTypes); - (void)$_state.addRegion(); - }]>, - OpBuilder< - (ins "TypeRange":$resultTensorTypes, "ValueRange":$inputs, - "ValueRange":$outputs, - "Attribute":$cast, CArg<"ArrayRef", "{}">:$attributes), - [{ - $_state.addAttribute("cast", cast); - buildStructuredOp($_builder, $_state, resultTensorTypes, inputs, outputs, - attributes, MatmulOp::getRegionBuilder()); - }]> - - ]; - let hasCustomAssemblyFormat = 1; - let hasFolder = 1; - let hasVerifier = 1; - - let extraClassDeclaration = structuredOpsBaseDecls # [{ - SmallVector getIteratorTypesArray(); - - /// Implements the block region builder. - static void regionBuilder(ImplicitLocOpBuilder &b, - Block &block, ArrayRef attrs); - - /// Returns a list of AffineMap with the typical matmul indexing charactristic. - SmallVector getDefaultIndexingMaps(); - - /// Returns true if the given broadcast map \p bcastMap is valid for this op. - bool isValidLhsRhsBroadcastMap(AffineMap bcastMap); - - static std::function)> - getRegionBuilder() { - return regionBuilder; - } - - ::mlir::MutableOperandRange getDpsInitsMutable() { - return getOutputsMutable(); - } - - // Generic methods. - static unsigned getNumRegionArgs(); - std::string getLibraryCallName(); - bool hasDynamicIndexingMaps(); - /// Check if the op has broadcast and/or transpose semantic. Returns true if the - /// user defined indexing maps are not equal to default map. 
- bool hasUserDefinedMaps(); - }]; -} - //===----------------------------------------------------------------------===// // Named Linalg ops, implemented as a declarative configurations of generic ops. //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp index 3b9194098fa7..40795879c302 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp @@ -15,20 +15,13 @@ #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/BuiltinTypeInterfaces.h" -#include "mlir/IR/MLIRContext.h" #include "mlir/IR/TypeUtilities.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/Support/Casting.h" -#include "llvm/Support/raw_ostream.h" #include -#include using namespace mlir; using namespace mlir::linalg; @@ -1149,6 +1142,7 @@ int64_t LinalgOp::getIndexingMapIndex(OpOperand *opOperand) { LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { LinalgOp linalgOp = cast(op); + // Mixed tensor/buffer operands are not allowed. if (!linalgOp.hasPureTensorSemantics() && !linalgOp.hasPureBufferSemantics() && op->getNumOperands() > 0) @@ -1168,8 +1162,6 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << ") to be equal to the number of input/output operands (" << linalgOp->getNumOperands() << ")"; - // Set this flag if this op has user defined maps. This is required to guard - // the below error condition which assume default indexing maps. 
for (OpOperand &opOperand : linalgOp->getOpOperands()) { AffineMap indexingMap = linalgOp.getMatchingIndexingMap(&opOperand); @@ -1186,13 +1178,13 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { << " dim(s) to match the number of loops"; int64_t rank = linalgOp.getRank(&opOperand); - if (indexingMap.getNumResults() != rank) return op->emitOpError("expected operand rank (") << rank << ") to match the result rank of indexing_map #" << opOperand.getOperandNumber() << " (" << indexingMap.getNumResults() << ")"; } + SmallVector redDims; linalgOp.getReductionDims(redDims); @@ -1202,8 +1194,9 @@ LogicalResult mlir::linalg::detail::verifyStructuredOpInterface(Operation *op) { // Check if given shapes match to inferred shapes. SmallVector endLoopRangeValues = linalgOp.getStaticLoopRanges(); SmallVector startLoopRangeValues(endLoopRangeValues.size(), 0); - // Verify only static cases since we can't get exact dimension sizes and - // loop ranges for dynamic cases in this stage. + + // Verify only static cases since we can't get exact dimension sizes and loop + // ranges for dynamic cases in this stage. 
if (llvm::none_of(endLoopRangeValues, ShapedType::isDynamic)) { for (int64_t &range : endLoopRangeValues) range -= 1; diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index c909d13e4314..730c478c2883 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -27,7 +27,6 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypeInterfaces.h" #include "mlir/IR/Matchers.h" @@ -38,17 +37,12 @@ #include "mlir/Interfaces/SideEffectInterfaces.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/LogicalResult.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include #include using namespace mlir; @@ -155,36 +149,15 @@ static void fillStructuredOpRegion(OpBuilder &opBuilder, Region ®ion, // iterator_types is an auto-generated method. } -/// Helper to create a typical indexing map for MatmulOp. Returns a list of -/// AffineMap. -static SmallVector -getDefaultIndexingMapsForMatmul(MLIRContext *context) { - AffineExpr d0, d1, d2; - SmallVector indexingMaps; - bindDims(context, d0, d1, d2); - indexingMaps.push_back(AffineMap::get(3, 0, {d0, d2}, context)); - indexingMaps.push_back(AffineMap::get(3, 0, {d2, d1}, context)); - indexingMaps.push_back(AffineMap::get(3, 0, {d0, d1}, context)); - return indexingMaps; -} - -/// Wrapper to return the typical indexing map array attribute for MatmulOp. 
-static SmallVector getDefaultIndexingMapAttr(MLIRContext *context) { - return llvm::map_to_vector( - getDefaultIndexingMapsForMatmul(context), - [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); -} - /// Creates a structured operation given `inputs`, `outputs`, and `attributes`. /// The result types are derived automatically if `resultTensorTypes` is none. /// The body of the operation is filled using `regionBuilder`. All ods-gen /// created structured operations use the method to implement their builders. -static void buildStructuredOp( - OpBuilder &b, OperationState &state, - std::optional resultTensorTypes, ValueRange inputs, - ValueRange outputs, ArrayRef attributes, - RegionBuilderFn regionBuilder, - std::optional> indexingMaps = std::nullopt) { +static void buildStructuredOp(OpBuilder &b, OperationState &state, + std::optional resultTensorTypes, + ValueRange inputs, ValueRange outputs, + ArrayRef attributes, + RegionBuilderFn regionBuilder) { // Derive the result types if needed. SmallVector derivedResultTypes = resultTensorTypes.value_or(TypeRange()); @@ -195,20 +168,6 @@ static void buildStructuredOp( state.addOperands(inputs); state.addOperands(outputs); state.addTypes(derivedResultTypes); - - // Initialize indexingMaps, for MatmulOp. 
- SmallVector indexingMapsAttrVal; - if (indexingMaps.has_value()) { - for (mlir::AffineMap map : *indexingMaps) { - // Convert each AffineMap to an AffineMapAttr - indexingMapsAttrVal.push_back(AffineMapAttr::get(map)); - } - state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); - } else { - indexingMapsAttrVal = getDefaultIndexingMapAttr(b.getContext()); - state.addAttribute("indexing_maps", b.getArrayAttr(indexingMapsAttrVal)); - } - state.addAttributes(attributes); state.addAttribute( "operandSegmentSizes", @@ -340,48 +299,11 @@ static ParseResult parseNamedStructuredOp(OpAsmParser &parser, OperationState &result, unsigned numRegionArgs, RegionBuilderFn regionBuilder) { - - SmallVector indexingMapsAttr; - Attribute mapAttr; - if (succeeded(parser.parseOptionalKeyword("indexing_maps"))) { - if (parser.parseEqual()) - return failure(); - - if (parser.parseLSquare()) - return failure(); - - do { - if (parser.parseAttribute(mapAttr)) - return failure(); - if (!isa(mapAttr)) { - return parser.emitError(parser.getCurrentLocation(), - "expected affine map attribute"); - } - indexingMapsAttr.push_back(mapAttr); - - if (parser.parseOptionalComma()) - break; - } while (true); - - if (parser.parseRSquare()) - return failure(); - } - // Initialize indexingMaps, if not supplied explicitly. - if (indexingMapsAttr.empty()) { - indexingMapsAttr = getDefaultIndexingMapAttr(result.getContext()); - } - result.addAttribute("indexing_maps", - parser.getBuilder().getArrayAttr(indexingMapsAttr)); - // TODO: Enable when ods-gen supports captures. SmallVector inputTypes, outputTypes; if (parseCommonStructuredOpParts(parser, result, inputTypes, outputTypes)) return failure(); - // Parse optional attributes. - if (parser.parseOptionalAttrDict(result.attributes)) - return failure(); - // TODO: consider merging results parsing into region parsing. // Need to wait for declarative assembly resolution to decide. 
SmallVector outputTensorsTypes; @@ -407,9 +329,13 @@ static void printNamedStructuredOpResults(OpAsmPrinter &p, } static void printNamedStructuredOp(OpAsmPrinter &p, Operation *op, - ValueRange inputs, ValueRange outputs, - ArrayRef elidedAttrs = {}) { - p.printOptionalAttrDict(op->getAttrs(), elidedAttrs); + ValueRange inputs, ValueRange outputs) { + p.printOptionalAttrDict( + op->getAttrs(), + /*elidedAttrs=*/{"operandSegmentSizes", + // See generated code in + // LinalgNamedStructuredOps.yamlgen.cpp.inc + "linalg.memoized_indexing_maps"}); // Printing is shared with generic ops, except for the region and // attributes. @@ -3456,168 +3382,3 @@ Operation *LinalgDialect::materializeConstant(OpBuilder &builder, Location loc) { return arith::ConstantOp::materialize(builder, value, type, loc); } - -/// Returns true if the result AffineExpr of the \p explicitMap is same as \p -/// defaultMap. -static bool isValidResultDimExprs(AffineMap explictMap, AffineMap defaultMap) { - auto explicitRange = explictMap.getResults(); - auto defaultRange = defaultMap.getResults(); - DenseSet explicitSet(explicitRange.begin(), explicitRange.end()); - DenseSet defaultSet(defaultRange.begin(), defaultRange.end()); - llvm::set_union(explicitSet, defaultSet); - return explicitSet == defaultSet; -} - -/// Returns true if the \p explictMap is broadcasted with respect to the -/// \p defaultMap. -static bool isBroadcasted(AffineMap explictMap, AffineMap defaultMap) { - return explictMap.getNumResults() < defaultMap.getNumResults(); -} - -/// Verifies the broadcast and transpose semantic sepecified by the explicit -/// indexing map for the MatmulOp \p op for each operand specified by \p -/// opIndex. 
-static LogicalResult verifyExtendedMatmulSemantic(MatmulOp matmulOp, - unsigned opIndex) { - SmallVector opIndexingMaps = matmulOp.getIndexingMapsArray(); - SmallVector defaultIndexingMaps = - matmulOp.getDefaultIndexingMaps(); - - auto opIndexingMap = opIndexingMaps[opIndex]; - auto defaultIndexingMap = defaultIndexingMaps[opIndex]; - // Check general validity of indexing map results. - if (!isValidResultDimExprs(opIndexingMap, defaultIndexingMap)) - return matmulOp->emitOpError() - << "Unexpected dim expression in map result."; - - // Check if the requested broadcast is valid. - if (isBroadcasted(opIndexingMap, defaultIndexingMap)) { - if (!matmulOp.isValidLhsRhsBroadcastMap(opIndexingMap)) { - return matmulOp->emitOpError() - << "Invalid broadcast requested, should be (d2)."; - } - return success(); - } - return success(); -} - -namespace mlir { -namespace linalg { -//===----------------------------------------------------------------------===// -// MatMulOp -//===----------------------------------------------------------------------===// -SmallVector MatmulOp::getIteratorTypesArray() { - return SmallVector{utils::IteratorType::parallel, - utils::IteratorType::parallel, - utils::IteratorType::reduction}; -} - -unsigned MatmulOp::getNumRegionArgs() { return 3; } - -std::string MatmulOp::getLibraryCallName() { - return generateLibraryCallName(getOperation()); -} - -bool MatmulOp::hasDynamicIndexingMaps() { return true; } - -/// Check if the op has broadcast and/or transpose semantic. Returns true if the -/// user defined indexing maps are not equal to default map. -bool MatmulOp::hasUserDefinedMaps() { - SmallVector defaultMaps = getDefaultIndexingMaps(); - SmallVector explicitMaps = getIndexingMapsArray(); - return defaultMaps != explicitMaps; -} - -/// Implements the block region builder for the MatmulOp. This is called by -/// 'fillStructuredOpRegion'. 
-void MatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, - ArrayRef attrs) { - assert(3 > 0 && block.getNumArguments() == 3 && - "MatmulOp regionBuilder expects 3 (>=0) args"); - RegionBuilderHelper helper(b, block); - SmallVector yields; - - TypeFn castVal = TypeFn::cast_signed; - auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { - return attr.getName() == "cast"; - }); - if (castIter != attrs.end()) { - if (auto attr = llvm::dyn_cast(castIter->getValue())) - castVal = attr.getValue(); - } - - Value value1 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), - block.getArgument(0)); - Value value2 = helper.buildTypeFn(castVal, block.getArgument(2).getType(), - block.getArgument(1)); - Value value3 = helper.buildBinaryFn(BinaryFn::mul, value1, value2); - Value value4 = - helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), value3); - yields.push_back(value4); - helper.yieldOutputs(yields); -} - -/// Returns a list of AffineMap with the typical matmul indexing charactristic. -SmallVector MatmulOp::getDefaultIndexingMaps() { - MLIRContext *context = this->getContext(); - return getDefaultIndexingMapsForMatmul(context); -} - -/// Returns true if the given broadcast map \p bcastMap is valid for this op. -bool MatmulOp::isValidLhsRhsBroadcastMap(AffineMap bcastMap) { - assert(bcastMap.getNumResults() == 1 && "Expected single result dim expr."); - AffineExpr exp = bcastMap.getResult(0); - // Invalid map if the common dimension of matmul not found. 
- return exp.isFunctionOfDim(bcastMap.getNumDims() - 1); -} - -ParseResult MatmulOp::parse(OpAsmParser &parser, OperationState &result) { - return parseNamedStructuredOp(parser, result, MatmulOp::getNumRegionArgs(), - MatmulOp::getRegionBuilder()); -} -void MatmulOp::print(OpAsmPrinter &p) { - SmallVector elidedAttrs = { - "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; - printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); - - SmallVector indexingMaps = - getDefaultIndexingMapAttr(getContext()); - if (!llvm::equal(getIndexingMaps(), indexingMaps)) { - p << " indexing_maps = ["; - llvm::interleaveComma(getIndexingMaps(), p, - [&](Attribute attr) { p.printAttribute(attr); }); - p << "]"; - } -} - -/// Verify the user defined indexing maps. -LogicalResult MatmulOp::verify() { - // Verification of pure matmul is handled by verifyStructuredOpInterface(). - if (!hasUserDefinedMaps()) - return success(); - - for (unsigned opIndex = 0; opIndex < 2; opIndex++) { - if (failed(verifyExtendedMatmulSemantic(*this, opIndex))) - return failure(); - } - return success(); -} - -LogicalResult MatmulOp::fold(FoldAdaptor, SmallVectorImpl &) { - return memref::foldMemRefCast(*this); -} -void MatmulOp::getEffects( - SmallVectorImpl> - &effects) { - if (hasPureTensorSemantics()) - return; - getGenericEffectsImpl(effects, cast(getOperation())); -} - -Speculation::Speculatability MatmulOp::getSpeculatability() { - return getGenericSpeculatabilityImpl(cast(getOperation())); -} - -} // namespace linalg -} // namespace mlir diff --git a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp index 6b934f7e8157..aa0052ce47fa 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TransposeMatmul.cpp @@ -31,13 +31,6 @@ using namespace mlir::linalg; FailureOr mlir::linalg::transposeMatmul(RewriterBase &rewriter, linalg::MatmulOp 
matmulOp, bool transposeLHS) { - // Check to not let go the matmul with extended semantic, through this - // transform. - if (matmulOp.hasUserDefinedMaps()) { - return rewriter.notifyMatchFailure( - matmulOp, "only matmul ops with non-extended semantics are supported"); - } - if (!bufferization::hasTensorSemantics(matmulOp)) return rewriter.notifyMatchFailure( matmulOp, "only matmul ops with tensors are supported"); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index e3f010d9cfb2..09c6b2683b43 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2071,11 +2071,6 @@ vectorizeScalableVectorPrecondition(Operation *op, return failure(); } - // Check to not let go the matmul with extended semantic, through this - // transform. - if (linalgOp.hasUserDefinedMaps()) - return failure(); - // Cond 4: Only the following ops are supported in the // presence of scalable vectors return success(isElementwise(linalgOp) || isa(op) || diff --git a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp index 3c508ed6e324..0c2275bbc4b2 100644 --- a/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp +++ b/mlir/lib/Dialect/NVGPU/TransformOps/NVGPUTransformOps.cpp @@ -821,12 +821,6 @@ DiagnosedSilenceableFailure transform::RewriteMatmulAsMmaSyncOp::applyToOne( bool fail = true; // TODO: more robust detection of matmulOp, with transposes etc. if (isa_and_nonnull(linalgOp.getOperation())) { - // Check to not let go the matmul with extended semantic, through this - // transform. - if (linalgOp.hasUserDefinedMaps()) { - return emitSilenceableError() - << "only matmul ops with non-extended semantics are supported"; - } Location loc = linalgOp.getLoc(); // TODO: more robust computation of laneId, for now assume a single warp. 
Value laneId = rewriter.create( diff --git a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py index d5e79b4d3cb6..e4a6ec7487bb 100644 --- a/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py +++ b/mlir/python/mlir/dialects/linalg/opdsl/ops/core_named_ops.py @@ -383,6 +383,23 @@ def select( O[None] = TernaryFn.select(cond[None], lhs[None], rhs[None]) +@linalg_structured_op +def matmul( + A=TensorDef(T1, S.M, S.K), + B=TensorDef(T2, S.K, S.N), + C=TensorDef(U, S.M, S.N, output=True), + cast=TypeFnAttrDef(default=TypeFn.cast_signed), +): + """Performs a matrix multiplication of two 2D inputs. + + Numeric casting is performed on the operands to the inner multiply, promoting + them to the same data type as the accumulator/output. + """ + domain(D.m, D.n, D.k) + implements(ContractionOpInterface) + C[D.m, D.n] += cast(U, A[D.m, D.k]) * cast(U, B[D.k, D.n]) + + @linalg_structured_op def quantized_matmul( A=TensorDef(T1, S.M, S.K), diff --git a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir index aba26c35931f..1e8f1435ca0f 100644 --- a/mlir/test/Dialect/Linalg/generalize-named-ops.mlir +++ b/mlir/test/Dialect/Linalg/generalize-named-ops.mlir @@ -29,34 +29,6 @@ func.func @generalize_matmul_buffer(%A : memref<16x8xf32>, %B: memref<8x32xf32>, // ----- -func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func.func @matmul_bcast_a( -// 
CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) { -// CHECK: ^bb0(%[[VAL_3:.*]]: f32, %[[VAL_4:.*]]: f32, %[[VAL_5:.*]]: f32): -// CHECK: %[[VAL_6:.*]] = arith.mulf %[[VAL_3]], %[[VAL_4]] : f32 -// CHECK: %[[VAL_7:.*]] = arith.addf %[[VAL_5]], %[[VAL_6]] : f32 -// CHECK: linalg.yield %[[VAL_7]] : f32 -// CHECK: } -// CHECK: return -// CHECK: } - -// ----- - func.func @generalize_matmul_tensor(%A : tensor<16x8xf32>, %B: tensor<8x32xf32>, %C: tensor<16x32xf32>) -> tensor<16x32xf32> { %0 = linalg.matmul ins(%A, %B: tensor<16x8xf32>, tensor<8x32xf32>) outs(%C: tensor<16x32xf32>) -> tensor<16x32xf32> @@ -919,86 +891,3 @@ func.func @fill_tensor(%f: f32, %v: vector<2x4xf32>) -> (tensor, tensor, tensor> } - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) - outs(%arg2: 
memref<3x7xf32>) - - return -} - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func.func @matmul_transpose_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// ----- - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { - -// CHECK: linalg.generic {indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]], iterator_types = ["parallel", "parallel", "reduction"]} -// CHECK: arith.mulf -// CHECK: arith.addf - -func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// 
----- - diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir index b2869893b804..c481a723c562 100644 --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -361,165 +361,6 @@ func.func @invalid_static_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, // ----- -func.func @invalid_indexing_maps_matmul(%arg0: memref<2x4xf32>, %arg1: memref<3x4xf32>, %arg2: memref<2x4xf32>) { - // expected-error @+1 {{expected attribute value}} - linalg.matmul indexing_maps = [ - , - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<2x4xf32>, memref<3x4xf32>) - outs(%arg2 :memref<2x4xf32>) - return -} - -// ----- - -func.func @invalid_matmul_dim_a(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { - // expected-error @+1 {{Unexpected dim expression in map result}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) - return -} - -// ----- - -func.func @invalid_matmul_dim_b(%arg0: memref<5x5xf32>, %arg1: memref<5x5xf32>, %arg2: memref<5x5xf32>) { - // expected-error @+1 {{Unexpected dim expression in map result}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x5xf32>, memref<5x5xf32>) outs(%arg2: memref<5x5xf32>) - return -} - -// ----- - -func.func @invalid_transpose_a_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 4, but found 1}} - %0 = linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - 
affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) - outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> - return %0: tensor<4x64xf32> -} - -// ----- - -func.func @invalid_transpose_b_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) -> tensor<4x64xf32> { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #1 to be 1, but found 64}} - %0 = linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) - outs(%init : tensor<4x64xf32>) -> tensor<4x64xf32> - return %0: tensor<4x64xf32> -} - -// ----- - -func.func @invalid_bcast_a(%arg0: memref<3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Invalid broadcast requested, should be (d2)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_a_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #0 (1)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, 
d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_bcast_b_rank_mismatch(%arg0: memref<3x5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op expected operand rank (2) to match the result rank of indexing_map #1 (1)}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<7xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{inferred input/output operand #1 has shape's dimension #0 to be 5, but found 7}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_matmul_bcast_b_transpose_a_wrong_dim(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - // expected-error @+1 {{'linalg.matmul' op Unexpected dim expression in map result.}} - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// ----- - -func.func @invalid_indexing_maps_placement_matmul(%lhs: tensor<4x1xf32>, %rhs: tensor<1x64xf32>, %init: tensor<4x64xf32>) { - // expected-error @+2 {{custom op 'indexing_maps' is unknown (tried 'func.indexing_maps' as well)}} - linalg.matmul ins(%lhs, %rhs : tensor<4x1xf32>, tensor<1x64xf32>) outs(%init : tensor<4x64xf32>) - indexing_maps = [ - affine_map<(d0, d1, d2) -> 
(d0, d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - return -} - -// ----- - func.func @invalid_static_2d_conv(%input : memref<1x3x4x2xf32>, %filter: memref<3x2x2x1xf32>, %output: memref<1x2x3x1xf32>) { // expected-error @+1 {{inferred input/output operand #0 has shape's dimension #1 to be greater than or equal to 4, but found 3}} linalg.conv_2d_nhwc_hwcf diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir index 65c18de84247..02ecbed232c8 100644 --- a/mlir/test/Dialect/Linalg/named-ops.mlir +++ b/mlir/test/Dialect/Linalg/named-ops.mlir @@ -1201,249 +1201,6 @@ func.func @matmul_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %a // ----- -// CHECK-LABEL: func @matmul_transpose_a_explicit -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5x3xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) -func.func @matmul_transpose_a_explicit(%arg0: memref<5x3xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5x7xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// ----- - -func.func @matmul_transpose_b_explicit(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: 
memref<3x5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_transpose_a_b_explicit(%arg0: memref<5x3xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<7x5xf32>) - outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_transpose_a_b_explicit( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_a(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_a -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : 
memref<5xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_a_dim1(%arg0: memref<5xf32>, %arg1: memref<5x7xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5x7xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_a_dim1 -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<5xf32>, memref<5x7xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_b(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_b -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @matmul_bcast_a_b(%arg0: memref<5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: 
#[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_a_b( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, %[[VAL_1:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_0]], #[[$ATTR_1]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_b_dim1(%arg0: memref<3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d0, d2)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<3x5xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> -// CHECK-LABEL: func @matmul_bcast_b_dim1 -// CHECK: linalg.matmul -// CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<5xf32>) -// CHECK-SAME: outs(%{{.+}} : memref<3x7xf32>) - -// ----- - -func.func @dynamic_matmul_bcast_a(%arg0: memref, %arg1: memref, %arg2: memref) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d2, d1)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref, memref) outs(%arg2: memref) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @dynamic_matmul_bcast_a( -// CHECK-SAME: %[[VAL_0:.*]]: memref, -// CHECK-SAME: %[[VAL_1:.*]]: memref, -// CHECK-SAME: %[[VAL_2:.*]]: memref) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref, memref) outs(%[[VAL_2]] : memref) indexing_maps = [#[[$ATTR_0]], 
#[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_a_transpose_b(%arg0: memref<5xf32>, %arg1: memref<7x5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d1, d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5xf32>, memref<7x5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d1, d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_a_transpose_b( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<7x5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<7x5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// CHECK: return -// CHECK: } - -// ----- - -func.func @matmul_bcast_b_transpose_a(%arg0: memref<5x3xf32>, %arg1: memref<5xf32>, %arg2: memref<3x7xf32>) { - linalg.matmul indexing_maps = [ - affine_map<(d0, d1, d2) -> (d2, d0)>, - affine_map<(d0, d1, d2) -> (d2)>, - affine_map<(d0, d1, d2) -> (d0, d1)> - ] - ins(%arg0, %arg1 : memref<5x3xf32>, memref<5xf32>) outs(%arg2: memref<3x7xf32>) - return -} - -// CHECK: #[[$ATTR_0:.+]] = affine_map<(d0, d1, d2) -> (d2, d0)> -// CHECK: #[[$ATTR_1:.+]] = affine_map<(d0, d1, d2) -> (d2)> -// CHECK: #[[$ATTR_2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)> - -// CHECK-LABEL: func.func @matmul_bcast_b_transpose_a( -// CHECK-SAME: %[[VAL_0:.*]]: memref<5x3xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>, -// CHECK-SAME: %[[VAL_2:.*]]: memref<3x7xf32>) { -// CHECK: linalg.matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5x3xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] -// 
CHECK: return -// CHECK: } - -// ----- - // CHECK-LABEL: func @matmul_transpose_b // CHECK: linalg.matmul_transpose_b // CHECK-SAME: ins(%{{.+}}, %{{.+}} : memref<3x5xf32>, memref<7x5xf32>) diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py index 72045a07b2da..3bfbcf7d7f7c 100644 --- a/mlir/test/python/dialects/linalg/ops.py +++ b/mlir/test/python/dialects/linalg/ops.py @@ -84,6 +84,81 @@ def testNamedStructuredOpCustomForm(): print(module) + +# CHECK-LABEL: TEST: testNamedStructuredOpGenericForm +@run +def testNamedStructuredOpGenericForm(): + with Context() as ctx, Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def named_form(lhs, rhs): + init_result = tensor.empty([4, 8], f32) + # CHECK: "linalg.matmul"(%{{.*}}) + # CHECK-SAME: cast = #linalg.type_fn + # CHECK-SAME: operandSegmentSizes = array + # CHECK-NEXT: ^bb0(%{{.*}}: f32, %{{.*}}: f32, %{{.*}}: f32): + # CHECK-NEXT: arith.mulf{{.*}} (f32, f32) -> f32 + # CHECK-NEXT: arith.addf{{.*}} (f32, f32) -> f32 + # CHECK-NEXT: linalg.yield{{.*}} (f32) -> () + # CHECK-NEXT: (tensor<4x16xf32>, tensor<16x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> + return linalg.matmul(lhs, rhs, outs=[init_result]) + + module.operation.print(print_generic_op_form=True) + + +# CHECK-LABEL: TEST: testNamedStructuredAsGenericOp +@run +def testNamedStructuredAsGenericOp(): + with Context() as ctx, Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def generic_form(lhs, rhs): + init_result = tensor.EmptyOp([4, 8], f32) + # CHECK: linalg.generic + return linalg.matmul( + lhs, rhs, outs=[init_result.result], emit_generic=True + ) + + print(module) + + +# CHECK-LABEL: TEST: 
testOpResultFromOtherOp +@run +def testOpResultFromOtherOp(): + with Context(), Location.unknown(): + module = Module.create() + f32 = F32Type.get() + with InsertionPoint(module.body): + + @func.FuncOp.from_py_func( + RankedTensorType.get((4, 16), f32), RankedTensorType.get((16, 8), f32) + ) + def pass_an_op_directly(arg0, arg1): + one = arith.ConstantOp(F32Type.get(), 1.0) + # CHECK: %[[LHS:.*]] = linalg.fill + lhs = linalg.fill(one, outs=[arg0]) + # CHECK: %[[RHS:.*]] = linalg.fill + rhs = linalg.fill(one, outs=[arg1]) + # CHECK: %[[INIT:.*]] = tensor.empty + init = tensor.EmptyOp([4, 8], f32) + # CHECK: linalg.matmul + # CHECK: ins(%[[LHS]], %[[RHS]] + # CHECK: outs(%[[INIT]] + return linalg.matmul(lhs, rhs, outs=init) + + print(module) + + # CHECK-LABEL: TEST: testIdentityRegionOps @run def testIdentityRegionOps(): diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp index f820cb7ee8c3..aa5a52a21f12 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-yaml-gen.cpp @@ -681,11 +681,7 @@ ParseResult {0}::parse(OpAsmParser &parser, OperationState &result) {{ {0}::getNumRegionArgs(), {0}::getRegionBuilder()); } void {0}::print(OpAsmPrinter &p) {{ - SmallVector elidedAttrs = {{"operandSegmentSizes", - "linalg.memoized_indexing_maps", - "indexing_maps"}; - ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); + ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs()); } )FMT"; -- GitLab From a4402039bffd788b9af82435fd5a2fb311fdc6e8 Mon Sep 17 00:00:00 2001 From: Sebastian Kreutzer Date: Fri, 11 Oct 2024 05:23:34 -0400 Subject: [PATCH 011/345] [XRay] Add support for instrumentation of DSOs on x86_64 (#90959) This PR introduces shared library (DSO) support for XRay based on a revised version of the implementation outlined in [this 
RFC](https://discourse.llvm.org/t/rfc-upstreaming-dso-instrumentation-support-for-xray/73000). The feature enables the patching and handling of events from DSOs, supporting both libraries linked at startup or explicitly loaded, e.g. via `dlopen`. This patch adds the following: - The `-fxray-shared` flag to enable the feature (turned off by default) - A small runtime library that is linked into every instrumented DSO, providing position-independent trampolines and code to register with the main XRay runtime - Changes to the XRay runtime to support management and patching of multiple objects These changes are fully backward compatible, i.e. running without instrumented DSOs will produce identical traces (in terms of recorded function IDs) to the previous implementation. Due to my limited ability to test on other architectures, this feature is only implemented and tested with x86_64. Extending support to other architectures is fairly straightforward, requiring only a position-independent implementation of the architecture-specific trampoline implementation (see `compiler-rt/lib/xray/xray_trampoline_x86_64.S` for reference). This patch does not include any functionality to resolve function IDs from DSOs for the provided logging/tracing modes. These modes still work and will record calls from DSOs, but symbol resolution for these functions in not available. Getting this to work properly requires recording information about the loaded DSOs and should IMO be discussed in a separate RFC, as there are mulitple feasible approaches. 
@petrhosek @jplehr --- clang/include/clang/Basic/CodeGenOptions.def | 2 + clang/include/clang/Driver/Options.td | 5 + clang/include/clang/Driver/XRayArgs.h | 4 + clang/lib/Driver/ToolChains/CommonArgs.cpp | 12 +- clang/lib/Driver/XRayArgs.cpp | 21 ++ clang/test/Driver/XRay/xray-shared.cpp | 17 + .../cmake/Modules/AllSupportedArchDefs.cmake | 1 + compiler-rt/cmake/config-ix.cmake | 4 + compiler-rt/include/xray/xray_interface.h | 55 +++- compiler-rt/lib/xray/CMakeLists.txt | 86 +++++- compiler-rt/lib/xray/xray_dso_init.cpp | 62 ++++ compiler-rt/lib/xray/xray_init.cpp | 183 +++++++++-- compiler-rt/lib/xray/xray_interface.cpp | 292 ++++++++++++++---- .../lib/xray/xray_interface_internal.h | 83 ++++- compiler-rt/lib/xray/xray_trampoline_x86_64.S | 24 +- compiler-rt/lib/xray/xray_x86_64.cpp | 23 +- .../xray/TestCases/Posix/basic-mode-dso.cpp | 47 +++ .../TestCases/Posix/clang-xray-shared.cpp | 14 + .../test/xray/TestCases/Posix/dlopen.cpp | 107 +++++++ .../xray/TestCases/Posix/dso-dep-chains.cpp | 197 ++++++++++++ .../TestCases/Posix/patch-premain-dso.cpp | 45 +++ .../Posix/patching-unpatching-dso.cpp | 75 +++++ 22 files changed, 1215 insertions(+), 144 deletions(-) create mode 100644 clang/test/Driver/XRay/xray-shared.cpp create mode 100644 compiler-rt/lib/xray/xray_dso_init.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/dlopen.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp create mode 100644 compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index eac831278ee2..e45370bde74a 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ 
b/clang/include/clang/Basic/CodeGenOptions.def @@ -136,6 +136,8 @@ CODEGENOPT(XRayIgnoreLoops , 1, 0) ///< Emit the XRay function index section. CODEGENOPT(XRayFunctionIndex , 1, 1) +///< Set when -fxray-shared is enabled +CODEGENOPT(XRayShared , 1, 0) ///< Set the minimum number of instructions in a function to determine selective ///< XRay instrumentation. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d306c751505e..4ee16e213d0e 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2946,6 +2946,11 @@ def fxray_selected_function_group : HelpText<"When using -fxray-function-groups, select which group of functions to instrument. Valid range is 0 to fxray-function-groups - 1">, MarshallingInfoInt, "0">; +defm xray_shared : BoolFOption<"xray-shared", + CodeGenOpts<"XRayShared">, DefaultFalse, + PosFlag, + NegFlag>; defm fine_grained_bitfield_accesses : BoolOption<"f", "fine-grained-bitfield-accesses", CodeGenOpts<"FineGrainedBitfieldAccesses">, DefaultFalse, diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index bdd3d979547e..8fbcf469e5ba 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -27,6 +27,7 @@ class XRayArgs { XRayInstrSet InstrumentationBundle; llvm::opt::Arg *XRayInstrument = nullptr; bool XRayRT = true; + bool XRayShared = false; public: /// Parses the XRay arguments from an argument list. 
@@ -35,6 +36,9 @@ public: llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } + bool needsXRayDSORt() const { + return XRayInstrument && XRayRT && XRayShared; + } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0c6a585c3acf..0a1b7c209563 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1613,10 +1613,14 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, } bool tools::addXRayRuntime(const ToolChain&TC, const ArgList &Args, ArgStringList &CmdArgs) { - if (Args.hasArg(options::OPT_shared)) - return false; - - if (TC.getXRayArgs().needsXRayRt()) { + if (Args.hasArg(options::OPT_shared)) { + if (TC.getXRayArgs().needsXRayDSORt()) { + CmdArgs.push_back("--whole-archive"); + CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray-dso")); + CmdArgs.push_back("--no-whole-archive"); + return true; + } + } else if (TC.getXRayArgs().needsXRayRt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray")); for (const auto &Mode : TC.getXRayArgs().modeList()) diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 8c5134e25013..411054e067cb 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,6 +63,23 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } + if (Args.hasFlag(options::OPT_fxray_shared, + options::OPT_fno_xray_shared, false)) { + XRayShared = true; + + // DSO instrumentation is currently limited to x86_64 + if (Triple.getArch() != llvm::Triple::x86_64) { + D.Diag(diag::err_drv_unsupported_opt_for_target) + << "-fxray-shared" << Triple.str(); + } + + unsigned PICLvl 
= std::get<1>(tools::ParsePICArgs(TC, Args)); + if (!PICLvl) { + D.Diag(diag::err_opt_not_valid_without_opt) + << "-fxray-shared" << "-fPIC"; + } + } + // Both XRay and -fpatchable-function-entry use // TargetOpcode::PATCHABLE_FUNCTION_ENTER. if (Arg *A = Args.getLastArg(options::OPT_fpatchable_function_entry_EQ)) @@ -177,6 +194,10 @@ void XRayArgs::addArgs(const ToolChain &TC, const ArgList &Args, Args.addOptOutFlag(CmdArgs, options::OPT_fxray_function_index, options::OPT_fno_xray_function_index); + if (XRayShared) + Args.addOptInFlag(CmdArgs, options::OPT_fxray_shared, + options::OPT_fno_xray_shared); + if (const Arg *A = Args.getLastArg(options::OPT_fxray_instruction_threshold_EQ)) { int Value; diff --git a/clang/test/Driver/XRay/xray-shared.cpp b/clang/test/Driver/XRay/xray-shared.cpp new file mode 100644 index 000000000000..215854e1fc7c --- /dev/null +++ b/clang/test/Driver/XRay/xray-shared.cpp @@ -0,0 +1,17 @@ +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fpic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-PIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC +// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-pic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC + +// On 64 bit darwin, PIC is always enabled +// RUN: %clang -### --target=x86_64-apple-darwin -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s + +// Check unsupported targets +// RUN: not %clang -### --target=aarch64-pc-freebsd -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-TARGET 
+// RUN: not %clang -### --target=arm64-apple-macos -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-TARGET + +// CHECK: "-cc1" {{.*}}"-fxray-instrument" {{.*}}"-fxray-shared" +// ERR-TARGET: error: unsupported option '-fxray-shared' for target +// ERR-PIC: error: option '-fxray-shared' cannot be specified without '-fPIC' + diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index 809e92771569..50a4256b82fe 100644 --- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -104,6 +104,7 @@ else() set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} powerpc64le ${HEXAGON} ${LOONGARCH64}) endif() +set(ALL_XRAY_DSO_SUPPORTED_ARCH ${X86_64}) set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64}) if (UNIX) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index a93a88a92050..6134c9876b38 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -668,6 +668,9 @@ if(APPLE) list_intersect(XRAY_SUPPORTED_ARCH ALL_XRAY_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) + list_intersect(XRAY_DSO_SUPPORTED_ARCH + ALL_XRAY_DSO_SUPPORTED_ARCH + SANITIZER_COMMON_SUPPORTED_ARCH) list_intersect(SHADOWCALLSTACK_SUPPORTED_ARCH ALL_SHADOWCALLSTACK_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) @@ -702,6 +705,7 @@ else() filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH}) filter_available_targets(SCUDO_STANDALONE_SUPPORTED_ARCH ${ALL_SCUDO_STANDALONE_SUPPORTED_ARCH}) filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH}) + filter_available_targets(XRAY_DSO_SUPPORTED_ARCH ${ALL_XRAY_DSO_SUPPORTED_ARCH}) filter_available_targets(SHADOWCALLSTACK_SUPPORTED_ARCH ${ALL_SHADOWCALLSTACK_SUPPORTED_ARCH}) filter_available_targets(GWP_ASAN_SUPPORTED_ARCH ${ALL_GWP_ASAN_SUPPORTED_ARCH}) diff --git 
a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 727431c04e4f..717cfe292ce4 100644 --- a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,31 +93,74 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points. See XRayPatchingStatus +/// This tells XRay to patch the instrumentation points in all currently loaded objects. See XRayPatchingStatus /// for possible result values. extern XRayPatchingStatus __xray_patch(); +/// This tells XRay to patch the instrumentation points in the given object. +/// See XRayPatchingStatus for possible result values. +extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); + /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// This patches a specific function id. See XRayPatchingStatus for possible +/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for possible +/// result values. +extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); + +/// This unpacks the given (packed) function id and patches +/// the corresponding function. See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This unpatches a specific function id. See XRayPatchingStatus for possible +/// This patches a specific function in the given object. See XRayPatchingStatus for possible +/// result values. +extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, + int32_t ObjId); + +/// This unpacks the given (packed) function id and unpatches +/// the corresponding function. See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); -/// This function returns the address of the function provided a valid function -/// id. 
We return 0 if we encounter any error, even if 0 may be a valid function +/// This unpatches a specific function in the given object. +/// See XRayPatchingStatus for possible result values. +extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, + int32_t ObjId); + +/// This function unpacks the given (packed) function id and returns the address of the corresponding function. We return 0 if we encounter any error, even if 0 may be a valid function /// address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the maximum valid function id. Returns 0 if we +/// This function returns the address of the function in the given object provided valid function and object +/// ids. We return 0 if we encounter any error, even if 0 may be a valid function +/// address. +extern uintptr_t __xray_function_address_in_object(int32_t FuncId, + int32_t ObjId); + +/// This function returns the maximum valid function id for the main executable (object id = 0). Returns 0 if we /// encounter errors (when there are no instrumented functions, etc.). extern size_t __xray_max_function_id(); +/// This function returns the maximum valid function id for the given object. Returns 0 if we +/// encounter errors (when there are no instrumented functions, etc.). +extern size_t __xray_max_function_id_in_object(int32_t ObjId); + +/// This function returns the number of previously registered objects (executable + loaded DSOs). +/// Returns 0 if XRay has not been initialized. +extern size_t __xray_num_objects(); + +/// Unpacks the function id from the given packed id. +extern int32_t __xray_unpack_function_id(int32_t PackedId); + +/// Unpacks the object id from the given packed id. +extern int32_t __xray_unpack_object_id(int32_t PackedId); + +/// Creates and returns a packed id from the given function and object ids. +/// If the ids do not fit within the reserved number of bits for each part, the high bits are truncated. 
+extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); + /// Initialize the required XRay data structures. This is useful in cases where /// users want to control precisely when the XRay instrumentation data /// structures are initialized, for example when the XRay library is built with diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt index cf7b5062aae3..f38c07420c9a 100644 --- a/compiler-rt/lib/xray/CMakeLists.txt +++ b/compiler-rt/lib/xray/CMakeLists.txt @@ -10,6 +10,10 @@ set(XRAY_SOURCES xray_utils.cpp ) +set(XRAY_DSO_SOURCES + xray_dso_init.cpp + ) + # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES xray_fdr_flags.cpp @@ -33,6 +37,11 @@ set(x86_64_SOURCES xray_trampoline_x86_64.S ) +set(x86_64_DSO_SOURCES + xray_trampoline_x86_64.S + ) + + set(arm_SOURCES xray_arm.cpp xray_trampoline_arm.S @@ -128,10 +137,12 @@ set(XRAY_IMPL_HEADERS # consumption by tests. set(XRAY_ALL_SOURCE_FILES ${XRAY_SOURCES} + ${XRAY_DSO_SOURCES} ${XRAY_FDR_MODE_SOURCES} ${XRAY_BASIC_MODE_SOURCES} ${XRAY_PROFILING_MODE_SOURCES} ${x86_64_SOURCES} + ${x86_64_DSO_SOURCES} ${arm_SOURCES} ${armhf_SOURCES} ${hexagon_SOURCES} @@ -162,6 +173,9 @@ set(XRAY_CFLAGS ${COMPILER_RT_CXX_CFLAGS}) set(XRAY_COMMON_DEFINITIONS SANITIZER_COMMON_NO_REDEFINE_BUILTINS XRAY_HAS_EXCEPTIONS=1) +# DSO trampolines need to be compiled with GOT addressing +set(XRAY_COMMON_DEFINITIONS_DSO ${XRAY_COMMON_DEFINITIONS} XRAY_PIC) + # Too many existing bugs, needs cleanup. 
append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format XRAY_CFLAGS) @@ -201,7 +215,16 @@ if (APPLE) CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) + add_compiler_rt_object_libraries(RTXrayDSO + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + SOURCES ${XRAY_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) set(XRAY_RTXRAY_ARCH_LIBS "") + set(XRAY_DSO_RTXRAY_ARCH_LIBS "") foreach(arch ${XRAY_SUPPORTED_ARCH}) if(NOT ${arch} IN_LIST XRAY_SOURCE_ARCHS) continue() @@ -215,6 +238,17 @@ if (APPLE) DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) list(APPEND XRAY_RTXRAY_ARCH_LIBS RTXray_${arch}) + if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) + add_compiler_rt_object_libraries(RTXrayDSO_${arch} + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + SOURCES ${${arch}_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) + list(APPEND XRAY_DSO_RTXRAY_ARCH_LIBS RTXrayDSO_${arch}) + endif() endforeach() add_compiler_rt_object_libraries(RTXrayFDR OS ${XRAY_SUPPORTED_OS} @@ -252,6 +286,17 @@ if (APPLE) LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-dso + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_DSO_SUPPORTED_ARCH} + OBJECT_LIBS RTXrayDSO ${XRAY_DSO_RTXRAY_ARCH_LIBS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) + add_compiler_rt_runtime(clang_rt.xray-fdr STATIC OS ${XRAY_SUPPORTED_OS} @@ -346,16 +391,37 @@ else() # not Apple DEFS ${XRAY_COMMON_DEFINITIONS} OBJECT_LIBS RTXrayBASIC PARENT_TARGET xray) - # Profiler Mode runtime - add_compiler_rt_runtime(clang_rt.xray-profiling - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS 
${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayPROFILING - PARENT_TARGET xray) + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiling + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILING + PARENT_TARGET xray) + + if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) + # TODO: Only implemented for X86 at the moment + add_compiler_rt_object_libraries(RTXrayDSO + ARCHS ${arch} + SOURCES ${XRAY_DSO_SOURCES} ${${arch}_DSO_SOURCES} + ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS_DSO} + DEPS ${XRAY_DEPS}) + # DSO runtime archive + add_compiler_rt_runtime(clang_rt.xray-dso + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayDSO + PARENT_TARGET xray) + endif() endforeach() endif() # not Apple diff --git a/compiler-rt/lib/xray/xray_dso_init.cpp b/compiler-rt/lib/xray/xray_dso_init.cpp new file mode 100644 index 000000000000..eb754db54c64 --- /dev/null +++ b/compiler-rt/lib/xray/xray_dso_init.cpp @@ -0,0 +1,62 @@ +//===-- xray_init.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay initialisation logic for DSOs. 
+//===----------------------------------------------------------------------===// + +#include "sanitizer_common/sanitizer_atomic.h" +#include "xray_defs.h" +#include "xray_flags.h" +#include "xray_interface_internal.h" + +using namespace __sanitizer; + +extern "C" { +extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)) +__attribute__((visibility("hidden"))); +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)) +__attribute__((visibility("hidden"))); + +#if SANITIZER_APPLE +// HACK: This is a temporary workaround to make XRay build on +// Darwin, but it will probably not work at runtime. +extern const XRaySledEntry __start_xray_instr_map[] = {}; +extern const XRaySledEntry __stop_xray_instr_map[] = {}; +extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {}; +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {}; +#endif +} + +// Handler functions to call in the patched entry/exit sled. +extern atomic_uintptr_t XRayPatchedFunction; +extern atomic_uintptr_t XRayArgLogger; +extern atomic_uintptr_t XRayPatchedCustomEvent; +extern atomic_uintptr_t XRayPatchedTypedEvent; + +static int __xray_object_id{-1}; + +// Note: .preinit_array initialization does not work for DSOs +__attribute__((constructor(0))) static void +__xray_init_dso() XRAY_NEVER_INSTRUMENT { + // Register sleds in main XRay runtime. + __xray_object_id = + __xray_register_dso(__start_xray_instr_map, __stop_xray_instr_map, + __start_xray_fn_idx, __stop_xray_fn_idx, {}); +} + +__attribute__((destructor(0))) static void +__xray_finalize_dso() XRAY_NEVER_INSTRUMENT { + // Inform the main runtime that this DSO is no longer used. 
+ __xray_deregister_dso(__xray_object_id); +} diff --git a/compiler-rt/lib/xray/xray_init.cpp b/compiler-rt/lib/xray/xray_init.cpp index f22a31b95686..53c93be89cd1 100644 --- a/compiler-rt/lib/xray/xray_init.cpp +++ b/compiler-rt/lib/xray/xray_init.cpp @@ -16,6 +16,8 @@ #include #include "sanitizer_common/sanitizer_common.h" +#include "xray/xray_interface.h" +#include "xray_allocator.h" #include "xray_defs.h" #include "xray_flags.h" #include "xray_interface_internal.h" @@ -28,7 +30,7 @@ extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); #if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on +// HACK: This is a temporary workaround to make XRay build on // Darwin, but it will probably not work at runtime. const XRaySledEntry __start_xray_instr_map[] = {}; extern const XRaySledEntry __stop_xray_instr_map[] = {}; @@ -43,14 +45,16 @@ using namespace __xray; // the weak symbols defined above (__start_xray_inst_map and // __stop_xray_instr_map) to initialise the instrumentation map that XRay uses // for runtime patching/unpatching of instrumentation points. -// -// FIXME: Support DSO instrumentation maps too. The current solution only works -// for statically linked executables. atomic_uint8_t XRayInitialized{0}; // This should always be updated before XRayInitialized is updated. SpinMutex XRayInstrMapMutex; -XRaySledMap XRayInstrMap; + +// Contains maps for the main executable as well as DSOs. +XRaySledMap *XRayInstrMaps; + +// Number of binary objects registered. +atomic_uint32_t XRayNumObjects{0}; // Global flag to determine whether the flags have been initialized. atomic_uint8_t XRayFlagsInitialized{0}; @@ -58,6 +62,63 @@ atomic_uint8_t XRayFlagsInitialized{0}; // A mutex to allow only one thread to initialize the XRay data structures. 
SpinMutex XRayInitMutex; +// Registers XRay sleds and trampolines coming from the main executable or one +// of the linked DSOs. +// Returns the object ID if registration is successful, -1 otherwise. +int32_t +__xray_register_sleds(const XRaySledEntry *SledsBegin, + const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, bool FromDSO, + XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { + if (!SledsBegin || !SledsEnd) { + Report("Invalid XRay sleds.\n"); + return -1; + } + XRaySledMap SledMap; + SledMap.FromDSO = FromDSO; + SledMap.Loaded = true; + SledMap.Trampolines = Trampolines; + SledMap.Sleds = SledsBegin; + SledMap.Entries = SledsEnd - SledsBegin; + if (FnIndexBegin != nullptr) { + SledMap.SledsIndex = FnIndexBegin; + SledMap.Functions = FnIndexEnd - FnIndexBegin; + } else { + size_t CountFunctions = 0; + uint64_t LastFnAddr = 0; + + for (std::size_t I = 0; I < SledMap.Entries; I++) { + const auto &Sled = SledMap.Sleds[I]; + const auto Function = Sled.function(); + if (Function != LastFnAddr) { + CountFunctions++; + LastFnAddr = Function; + } + } + SledMap.SledsIndex = nullptr; + SledMap.Functions = CountFunctions; + } + if (SledMap.Functions >= XRayMaxFunctions) { + Report("Too many functions! Maximum is %ld\n", XRayMaxFunctions); + return -1; + } + + if (Verbosity()) + Report("Registering %d new functions!\n", SledMap.Functions); + + { + SpinMutexLock Guard(&XRayInstrMapMutex); + auto Idx = atomic_fetch_add(&XRayNumObjects, 1, memory_order_acq_rel); + if (Idx >= XRayMaxObjects) { + Report("Too many objects registered! Maximum is %ld\n", XRayMaxObjects); + return -1; + } + XRayInstrMaps[Idx] = std::move(SledMap); + return Idx; + } +} + // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. 
void __xray_init() XRAY_NEVER_INSTRUMENT { @@ -80,29 +141,21 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { return; } - { - SpinMutexLock Guard(&XRayInstrMapMutex); - XRayInstrMap.Sleds = __start_xray_instr_map; - XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; - if (__start_xray_fn_idx != nullptr) { - XRayInstrMap.SledsIndex = __start_xray_fn_idx; - XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; - } else { - size_t CountFunctions = 0; - uint64_t LastFnAddr = 0; - - for (std::size_t I = 0; I < XRayInstrMap.Entries; I++) { - const auto &Sled = XRayInstrMap.Sleds[I]; - const auto Function = Sled.function(); - if (Function != LastFnAddr) { - CountFunctions++; - LastFnAddr = Function; - } - } + atomic_store(&XRayNumObjects, 0, memory_order_release); - XRayInstrMap.Functions = CountFunctions; - } + // Pre-allocation takes up approx. 22kB for XRayMaxObjects=256. + XRayInstrMaps = allocateBuffer(XRayMaxObjects); + + int MainBinaryId = + __xray_register_sleds(__start_xray_instr_map, __stop_xray_instr_map, + __start_xray_fn_idx, __stop_xray_fn_idx, false, {}); + + // The executable should always get ID 0. + if (MainBinaryId != 0) { + Report("Registering XRay sleds failed.\n"); + return; } + atomic_store(&XRayInitialized, true, memory_order_release); #ifndef XRAY_NO_PREINIT @@ -111,6 +164,84 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { #endif } +// Registers XRay sleds and trampolines of an instrumented DSO. +// Returns the object ID if registration is successful, -1 otherwise. +// +// Default visibility is hidden, so we have to explicitly make it visible to +// DSO. +SANITIZER_INTERFACE_ATTRIBUTE int32_t __xray_register_dso( + const XRaySledEntry *SledsBegin, const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, + XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { + // Make sure XRay has been initialized in the main executable.
+ __xray_init(); + + if (__xray_num_objects() == 0) { + if (Verbosity()) + Report("No XRay instrumentation map in main executable. Not initializing " + "XRay for DSO.\n"); + return -1; + } + + // Register sleds in global map. + int ObjId = __xray_register_sleds(SledsBegin, SledsEnd, FnIndexBegin, + FnIndexEnd, true, Trampolines); + +#ifndef XRAY_NO_PREINIT + if (ObjId >= 0 && flags()->patch_premain) + __xray_patch_object(ObjId); +#endif + + return ObjId; +} + +// Deregisters a DSO from the main XRay runtime. +// Called from the DSO-local runtime when the library is unloaded (e.g. if +// dlclose is called). +// Returns true if the object ID is valid and the DSO was successfully +// deregistered. +SANITIZER_INTERFACE_ATTRIBUTE bool +__xray_deregister_dso(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + + if (!atomic_load(&XRayInitialized, memory_order_acquire)) { + if (Verbosity()) + Report("XRay has not been initialized. Cannot deregister DSO.\n"); + return false; + } + + if (ObjId <= 0 || ObjId >= __xray_num_objects()) { + if (Verbosity()) + Report("Can't deregister object with ID %d: ID is invalid.\n", ObjId); + return false; + } + + { + SpinMutexLock Guard(&XRayInstrMapMutex); + auto &Entry = XRayInstrMaps[ObjId]; + if (!Entry.FromDSO) { + if (Verbosity()) + Report("Can't deregister object with ID %d: object does not correspond " + "to a shared library.\n", + ObjId); + return false; + } + if (!Entry.Loaded) { + if (Verbosity()) + Report("Can't deregister object with ID %d: object is not loaded.\n", + ObjId); + return true; + } + // Mark DSO as unloaded. No need to unpatch. + Entry.Loaded = false; + } + + if (Verbosity()) + Report("Deregistered object with ID %d.\n", ObjId); + + return true; +} + // FIXME: Make check-xray tests work on FreeBSD without // SANITIZER_CAN_USE_PREINIT_ARRAY. // See sanitizer_internal_defs.h where the macro is defined. 
diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 5839043fcb93..16e60bfc22cd 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "xray_interface_internal.h" +#include "llvm/Support/ErrorHandling.h" #include #include @@ -36,7 +37,8 @@ extern __sanitizer::SpinMutex XRayInstrMapMutex; extern __sanitizer::atomic_uint8_t XRayInitialized; -extern __xray::XRaySledMap XRayInstrMap; +extern __xray::XRaySledMap *XRayInstrMaps; +extern __sanitizer::atomic_uint32_t XRayNumObjects; namespace __xray { @@ -61,16 +63,16 @@ static const int16_t cSledLength = 20; #endif /* CPU architecture */ // This is the function to call when we encounter the entry or exit sleds. -atomic_uintptr_t XRayPatchedFunction{0}; +atomic_uintptr_t XRayPatchedFunction SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call from the arg1-enabled sleds/trampolines. -atomic_uintptr_t XRayArgLogger{0}; +atomic_uintptr_t XRayArgLogger SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call when we encounter a custom event log call. -atomic_uintptr_t XRayPatchedCustomEvent{0}; +atomic_uintptr_t XRayPatchedCustomEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the function to call when we encounter a typed event log call. -atomic_uintptr_t XRayPatchedTypedEvent{0}; +atomic_uintptr_t XRayPatchedTypedEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; // This is the global status to determine whether we are currently // patching/unpatching. 
@@ -150,27 +152,42 @@ public: namespace { -bool patchSled(const XRaySledEntry &Sled, bool Enable, - int32_t FuncId) XRAY_NEVER_INSTRUMENT { +bool isObjectLoaded(int32_t ObjId) { + SpinMutexLock Guard(&XRayInstrMapMutex); + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + return false; + } + return XRayInstrMaps[ObjId].Loaded; +} + +bool patchSled(const XRaySledEntry &Sled, bool Enable, int32_t FuncId, + const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT { bool Success = false; switch (Sled.Kind) { case XRayEntryType::ENTRY: - Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry); + Success = + patchFunctionEntry(Enable, FuncId, Sled, Trampolines.EntryTrampoline); break; case XRayEntryType::EXIT: - Success = patchFunctionExit(Enable, FuncId, Sled); + Success = + patchFunctionExit(Enable, FuncId, Sled, Trampolines.ExitTrampoline); break; case XRayEntryType::TAIL: - Success = patchFunctionTailExit(Enable, FuncId, Sled); + Success = patchFunctionTailExit(Enable, FuncId, Sled, + Trampolines.TailExitTrampoline); break; case XRayEntryType::LOG_ARGS_ENTRY: - Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); + Success = + patchFunctionEntry(Enable, FuncId, Sled, Trampolines.LogArgsTrampoline); break; case XRayEntryType::CUSTOM_EVENT: - Success = patchCustomEvent(Enable, FuncId, Sled); + Success = patchCustomEvent(Enable, FuncId, Sled, + Trampolines.CustomEventTrampoline); break; case XRayEntryType::TYPED_EVENT: - Success = patchTypedEvent(Enable, FuncId, Sled); + Success = + patchTypedEvent(Enable, FuncId, Sled, Trampolines.TypedEventTrampoline); break; default: Report("Unsupported sled kind '%" PRIu64 "' @%04x\n", Sled.Address, @@ -205,10 +222,9 @@ findFunctionSleds(int32_t FuncId, return Index; } -XRayPatchingStatus patchFunction(int32_t FuncId, +XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, bool Enable) XRAY_NEVER_INSTRUMENT { - if (!atomic_load(&XRayInitialized, - 
memory_order_acquire)) + if (!atomic_load(&XRayInitialized, memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. uint8_t NotPatching = false; @@ -220,13 +236,24 @@ XRayPatchingStatus patchFunction(int32_t FuncId, XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch function: invalid sled map index: %d", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; } // If we don't have an index, we can't patch individual functions. if (InstrMap.Functions == 0) return XRayPatchingStatus::NOT_INITIALIZED; + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Invalid function id provided: %d\n", FuncId); + return XRayPatchingStatus::NOT_INITIALIZED; + } + // FuncId must be a positive number, less than the number of functions // instrumented. if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) { @@ -234,6 +261,8 @@ XRayPatchingStatus patchFunction(int32_t FuncId, return XRayPatchingStatus::FAILED; } + auto PackedId = __xray::MakePackedId(FuncId, ObjId); + // Now we patch ths sleds for this specific function. 
XRayFunctionSledIndex SledRange; if (InstrMap.SledsIndex) { @@ -242,13 +271,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, } else { SledRange = findFunctionSleds(FuncId, InstrMap); } + auto *f = SledRange.Begin; bool SucceedOnce = false; for (size_t i = 0; i != SledRange.Size; ++i) - SucceedOnce |= patchSled(f[i], Enable, FuncId); + SucceedOnce |= patchSled(f[i], Enable, PackedId, InstrMap.Trampolines); - atomic_store(&XRayPatching, false, - memory_order_release); + atomic_store(&XRayPatching, false, memory_order_release); if (!SucceedOnce) { Report("Failed patching any sled for function '%d'.", FuncId); @@ -261,32 +290,31 @@ XRayPatchingStatus patchFunction(int32_t FuncId, // controlPatching implements the common internals of the patching/unpatching // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. -XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { - if (!atomic_load(&XRayInitialized, - memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. - - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong( - &XRayPatching, &NotPatching, true, memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. - - uint8_t PatchingSuccess = false; - auto XRayPatchingStatusResetter = - at_scope_exit([&PatchingSuccess] { - if (!PatchingSuccess) - atomic_store(&XRayPatching, false, - memory_order_release); - }); - +// This function should only be called after ensuring that XRay is initialized +// and no other thread is currently patching. 
+XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch functions: invalid sled map index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; } if (InstrMap.Entries == 0) return XRayPatchingStatus::NOT_INITIALIZED; + if (Verbosity()) + Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries); + + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Object is not loaded at index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + uint32_t FuncId = 1; uint64_t CurFun = 0; @@ -336,20 +364,96 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { ++FuncId; CurFun = F; } - patchSled(Sled, Enable, FuncId); + auto PackedId = __xray::MakePackedId(FuncId, ObjId); + patchSled(Sled, Enable, PackedId, InstrMap.Trampolines); } - atomic_store(&XRayPatching, false, - memory_order_release); - PatchingSuccess = true; + atomic_store(&XRayPatching, false, memory_order_release); return XRayPatchingStatus::SUCCESS; } -XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, +// Controls patching for all registered objects. +// Returns: SUCCESS, if patching succeeds for all objects. +// NOT_INITIALIZED, if one or more objects returned NOT_INITIALIZED +// but none failed. +// FAILED, if patching of one or more objects failed. +XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { + if (!atomic_load(&XRayInitialized, memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, + memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + auto XRayPatchingStatusResetter = at_scope_exit( + [] { atomic_store(&XRayPatching, false, memory_order_release); }); + + unsigned NumObjects = __xray_num_objects(); + + XRayPatchingStatus CombinedStatus{NOT_INITIALIZED}; + for (unsigned I = 0; I < NumObjects; ++I) { + if (!isObjectLoaded(I)) + continue; + auto LastStatus = controlPatchingObjectUnchecked(Enable, I); + switch (LastStatus) { + case SUCCESS: + if (CombinedStatus == NOT_INITIALIZED) + CombinedStatus = SUCCESS; + break; + case FAILED: + // Report failure, but try to patch the remaining objects + CombinedStatus = FAILED; + break; + case NOT_INITIALIZED: + // XRay has been initialized but there are no sleds available for this + // object. Try to patch remaining objects. + if (CombinedStatus != FAILED) + CombinedStatus = NOT_INITIALIZED; + break; + case ONGOING: + llvm_unreachable("Status ONGOING should not appear at this point"); + default: + llvm_unreachable("Unhandled patching status"); + } + } + return CombinedStatus; +} + +// Controls patching for one object. +XRayPatchingStatus controlPatching(bool Enable, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + + if (!atomic_load(&XRayInitialized, memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, + memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + auto XRayPatchingStatusResetter = at_scope_exit( + [] { atomic_store(&XRayPatching, false, memory_order_release); }); + + return controlPatchingObjectUnchecked(Enable, ObjId); +} + +XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, bool Enable) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + if (ObjId < 0 || + ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { + Report("Unable to patch function: invalid sled map index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; + } + InstrMap = XRayInstrMaps[ObjId]; + } + + // Check if the corresponding DSO has been unloaded. + if (!InstrMap.Loaded) { + Report("Object is not loaded at index: %d\n", ObjId); + return XRayPatchingStatus::FAILED; } // FuncId must be a positive number, less than the number of functions @@ -398,7 +502,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; } - return patchFunction(FuncId, Enable); + return patchFunction(FuncId, ObjId, Enable); } } // namespace @@ -412,12 +516,10 @@ using namespace __xray; int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 1; } return 0; @@ -425,11 +527,9 @@ int __xray_set_handler(void (*entry)(int32_t, int __xray_set_customevent_handler(void (*entry)(void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedCustomEvent, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 
1; } return 0; @@ -437,11 +537,9 @@ int __xray_set_customevent_handler(void (*entry)(void *, size_t)) int __xray_set_typedevent_handler(void (*entry)(size_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, - memory_order_acquire)) { + if (atomic_load(&XRayInitialized, memory_order_acquire)) { atomic_store(&__xray::XRayPatchedTypedEvent, - reinterpret_cast(entry), - memory_order_release); + reinterpret_cast(entry), memory_order_release); return 1; } return 0; @@ -474,39 +572,78 @@ XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { return controlPatching(true); } +XRayPatchingStatus __xray_patch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return controlPatching(true, ObjId); +} + XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { return controlPatching(false); } +XRayPatchingStatus __xray_unpatch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return controlPatching(false, ObjId); +} + XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, true); + auto Ids = __xray::UnpackId(FuncId); + auto ObjId = Ids.first; + auto FnId = Ids.second; + return mprotectAndPatchFunction(FnId, ObjId, true); +} + +XRayPatchingStatus +__xray_patch_function_in_object(int32_t FuncId, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return mprotectAndPatchFunction(FuncId, ObjId, true); } XRayPatchingStatus __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, false); + auto Ids = __xray::UnpackId(FuncId); + auto ObjId = Ids.first; + auto FnId = Ids.second; + return mprotectAndPatchFunction(FnId, ObjId, false); +} + +XRayPatchingStatus +__xray_unpatch_function_in_object(int32_t FuncId, + int32_t ObjId) XRAY_NEVER_INSTRUMENT { + return mprotectAndPatchFunction(FuncId, ObjId, false); } int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) { - if (!atomic_load(&XRayInitialized, - memory_order_acquire)) + 
if (!atomic_load(&XRayInitialized, memory_order_acquire)) return 0; // A relaxed write might not be visible even if the current thread gets // scheduled on a different CPU/NUMA node. We need to wait for everyone to // have this handler installed for consistency of collected data across CPUs. atomic_store(&XRayArgLogger, reinterpret_cast(entry), - memory_order_release); + memory_order_release); return 1; } int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } -uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { +uintptr_t +__xray_function_address(int32_t CombinedFuncId) XRAY_NEVER_INSTRUMENT { + auto Ids = __xray::UnpackId(CombinedFuncId); + return __xray_function_address_in_object(Ids.second, Ids.first); +} + +uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) + XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - InstrMap = XRayInstrMap; + auto count = atomic_load(&XRayNumObjects, memory_order_acquire); + if (ObjId < 0 || ObjId >= count) { + Report("Unable to determine function address: invalid sled map index %d " + "(size is %d)\n", + ObjId, (int)count); + return 0; + } + InstrMap = XRayInstrMaps[ObjId]; } if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) @@ -525,6 +662,29 @@ uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { } size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { + return __xray_max_function_id_in_object(0); +} + +size_t __xray_max_function_id_in_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { + SpinMutexLock Guard(&XRayInstrMapMutex); + if (ObjId < 0 || ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) + return 0; + return XRayInstrMaps[ObjId].Functions; +} + +size_t __xray_num_objects() XRAY_NEVER_INSTRUMENT { SpinMutexLock Guard(&XRayInstrMapMutex); - return XRayInstrMap.Functions; + return atomic_load(&XRayNumObjects, memory_order_acquire); +} + +int32_t __xray_unpack_function_id(int32_t 
PackedId) { + return __xray::UnpackId(PackedId).second; +} + +int32_t __xray_unpack_object_id(int32_t PackedId) { + return __xray::UnpackId(PackedId).first; +} + +int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId) { + return __xray::MakePackedId(FuncId, ObjId); } diff --git a/compiler-rt/lib/xray/xray_interface_internal.h b/compiler-rt/lib/xray/xray_interface_internal.h index 80c07c167f64..5fbaa9c3f315 100644 --- a/compiler-rt/lib/xray/xray_interface_internal.h +++ b/compiler-rt/lib/xray/xray_interface_internal.h @@ -18,6 +18,18 @@ #include "xray/xray_interface.h" #include #include +#include + +extern "C" { +// The following functions have to be defined in assembler, on a per-platform +// basis. See xray_trampoline_*.S files for implementations. +extern void __xray_FunctionEntry(); +extern void __xray_FunctionExit(); +extern void __xray_FunctionTailExit(); +extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); +extern void __xray_TypedEvent(); +} extern "C" { @@ -67,36 +79,77 @@ struct XRayFunctionSledIndex { uintptr_t(Begin)); } }; + +struct XRayTrampolines { + void (*EntryTrampoline)(); + void (*ExitTrampoline)(); + void (*TailExitTrampoline)(); + void (*LogArgsTrampoline)(); + void (*CustomEventTrampoline)(); + void (*TypedEventTrampoline)(); + + XRayTrampolines() { + // These resolve to the definitions in the respective executable or DSO. 
+ EntryTrampoline = __xray_FunctionEntry; + ExitTrampoline = __xray_FunctionExit; + TailExitTrampoline = __xray_FunctionTailExit; + LogArgsTrampoline = __xray_ArgLoggerEntry; + CustomEventTrampoline = __xray_CustomEvent; + TypedEventTrampoline = __xray_TypedEvent; + } +}; + +extern int32_t __xray_register_dso(const XRaySledEntry *SledsBegin, + const XRaySledEntry *SledsEnd, + const XRayFunctionSledIndex *FnIndexBegin, + const XRayFunctionSledIndex *FnIndexEnd, + XRayTrampolines Trampolines); + +extern bool __xray_deregister_dso(int32_t ObjId); } namespace __xray { +constexpr uint32_t XRayNFnBits = 24; +constexpr uint32_t XRayNObjBits = 8; + +constexpr uint32_t XRayFnBitMask = 0x00FFFFFF; +constexpr uint32_t XRayObjBitMask = 0xFF000000; + +constexpr size_t XRayMaxFunctions = 1 << XRayNFnBits; +constexpr size_t XRayMaxObjects = 1 << XRayNObjBits; + +inline int32_t MakePackedId(int32_t FnId, int32_t ObjId) { + return ((ObjId << XRayNFnBits) & XRayObjBitMask) | (FnId & XRayFnBitMask); +} + +inline std::pair UnpackId(int32_t PackedId) { + uint32_t ObjId = (PackedId & XRayObjBitMask) >> XRayNFnBits; + uint32_t FnId = PackedId & XRayFnBitMask; + return {ObjId, FnId}; +} + struct XRaySledMap { const XRaySledEntry *Sleds; size_t Entries; const XRayFunctionSledIndex *SledsIndex; size_t Functions; + XRayTrampolines Trampolines; + bool FromDSO; + bool Loaded; }; bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, - const XRaySledEntry &Sled); -bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); -bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); + const XRaySledEntry &Sled, void (*Trampoline)()); +bool patchCustomEvent(bool 
Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); +bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, + void (*Trampoline)()); } // namespace __xray -extern "C" { -// The following functions have to be defined in assembler, on a per-platform -// basis. See xray_trampoline_*.S files for implementations. -extern void __xray_FunctionEntry(); -extern void __xray_FunctionExit(); -extern void __xray_FunctionTailExit(); -extern void __xray_ArgLoggerEntry(); -extern void __xray_CustomEvent(); -extern void __xray_TypedEvent(); -} - #endif diff --git a/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/compiler-rt/lib/xray/xray_trampoline_x86_64.S index 01098f60eeab..0f480547b52c 100644 --- a/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -107,6 +107,16 @@ .section __TEXT,__text #endif +.macro LOAD_HANDLER_ADDR handler +#if !defined(XRAY_PIC) + movq ASM_SYMBOL(\handler)(%rip), %rax +#else + movq ASM_SYMBOL(\handler)@GOTPCREL(%rip), %rax + movq (%rax), %rax +#endif +.endm + + //===----------------------------------------------------------------------===// .globl ASM_SYMBOL(__xray_FunctionEntry) @@ -121,7 +131,7 @@ ASM_SYMBOL(__xray_FunctionEntry): // This load has to be atomic, it's concurrent with __xray_patch(). // On x86/amd64, a simple (type-aligned) MOV instruction is enough. 
- movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax, %rax je LOCAL_LABEL(tmp0) @@ -159,7 +169,7 @@ ASM_SYMBOL(__xray_FunctionExit): movupd %xmm1, 16(%rsp) movq %rax, 8(%rsp) movq %rdx, 0(%rsp) - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax,%rax je LOCAL_LABEL(tmp2) @@ -195,7 +205,7 @@ ASM_SYMBOL(__xray_FunctionTailExit): SAVE_REGISTERS ALIGN_STACK_16B - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax,%rax je LOCAL_LABEL(tmp4) @@ -224,12 +234,12 @@ ASM_SYMBOL(__xray_ArgLoggerEntry): ALIGN_STACK_16B // Again, these function pointer loads must be atomic; MOV is fine. - movq ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray13XRayArgLoggerE testq %rax, %rax jne LOCAL_LABEL(arg1entryLog) // If [arg1 logging handler] not set, defer to no-arg logging. - movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE testq %rax, %rax je LOCAL_LABEL(arg1entryFail) @@ -268,7 +278,7 @@ ASM_SYMBOL(__xray_CustomEvent): // We take two arguments to this trampoline, which should be in rdi and rsi // already. - movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray22XRayPatchedCustomEventE testq %rax,%rax je LOCAL_LABEL(customEventCleanup) @@ -293,7 +303,7 @@ ASM_SYMBOL(__xray_TypedEvent): // We pass three arguments to this trampoline, which should be in rdi, rsi // and rdx without our intervention. 
- movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax + LOAD_HANDLER_ADDR _ZN6__xray21XRayPatchedTypedEventE testq %rax,%rax je LOCAL_LABEL(typedEventCleanup) diff --git a/compiler-rt/lib/xray/xray_x86_64.cpp b/compiler-rt/lib/xray/xray_x86_64.cpp index b9666a40861d..663a51b26866 100644 --- a/compiler-rt/lib/xray/xray_x86_64.cpp +++ b/compiler-rt/lib/xray/xray_x86_64.cpp @@ -170,7 +170,8 @@ bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, } bool patchFunctionExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -192,11 +193,11 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, // Prerequisite is to compute the relative offset fo the // __xray_FunctionExit function's address. const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(__xray_FunctionExit) - + int64_t TrampolineOffset = reinterpret_cast(Trampoline) - (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(__xray_FunctionExit), + reinterpret_cast(Trampoline), reinterpret_cast(Address)); return false; } @@ -217,16 +218,16 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, } bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the tail call sled with a similar // sequence as the entry sled, but calls the tail exit sled instead. 
const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = - reinterpret_cast(__xray_FunctionTailExit) - - (static_cast(Address) + 11); + int64_t TrampolineOffset = reinterpret_cast(Trampoline) - + (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(__xray_FunctionTailExit), + reinterpret_cast(Trampoline), reinterpret_cast(Address)); return false; } @@ -247,7 +248,8 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, } bool patchCustomEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -275,7 +277,8 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, } bool patchTypedEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled, + void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp new file mode 100644 index 000000000000..31c615bd1f81 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp @@ -0,0 +1,47 @@ +// Testing shared library support in basic logging mode. 
+ +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s +// RUN: %llvm_xray account --format=csv --sort=funcid "`ls basic-mode-dso-* | head -1`" | FileCheck --check-prefix=ACCOUNT %s +// RUN: rm basic-mode-dso-* + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); + sleep(1); +} + +extern void instrumented_in_dso(); + +int main() { + // Explicit patching to ensure the DSO has been loaded + __xray_patch(); + instrumented_in_executable(); + // CHECK: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called +} + +//--- testlib.cpp + +#include +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} + +// ACCOUNT: funcid,count,min,median,90%ile,99%ile,max,sum,debug,function +// ACCOUNT-NEXT: 1,1,{{.*}} +// ACCOUNT-NEXT: 16777217,1,{{.*}} diff --git a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp new file mode 100644 index 000000000000..92f3c29e970d --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp @@ -0,0 +1,14 @@ +// Test that the DSO-local runtime library has been linked if -fxray-shared is passed. 
+// +// RUN: %clangxx -fxray-instrument -fxray-shared %s -shared -o %t.so +// RUN: llvm-nm %t.so | FileCheck %s --check-prefix ENABLED + +// RUN: %clangxx -fxray-instrument %s -shared -o %t.so +// RUN: llvm-nm %t.so | FileCheck %s --check-prefix DISABLED +// +// REQUIRES: target=x86_64{{.*}} + +[[clang::xray_always_instrument]] int always_instrumented() { return 42; } + +// ENABLED: __start_xray_instr_map +// DISABLED-NOT: __start_xray_instr_map diff --git a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp new file mode 100644 index 000000000000..9db411d5ff1c --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp @@ -0,0 +1,107 @@ +// Check that we can patch and un-patch DSOs loaded with dlopen. +// + +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp -o %t/main.o +// +// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlib.so 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +typedef void (*dso_func_type)(); + +int main(int argc, char **argv) { + if (argc < 2) { + printf("Shared library argument missing\n"); + // CHECK-NOT: Shared library argument missing + return 1; + } + + const char *dso_path = argv[1]; + + void *dso_handle = dlopen(dso_path, RTLD_LAZY); + if (!dso_handle) { + printf("Failed to load shared library\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + return 1; + } + return 1; + } + + dso_func_type instrumented_in_dso = + 
(dso_func_type)dlsym(dso_handle, "_Z19instrumented_in_dsov"); + if (!instrumented_in_dso) { + printf("Failed to find symbol\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + return 1; + } + return 1; + } + + __xray_set_handler(test_handler); + + instrumented_in_executable(); + // CHECK: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + + auto status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 + + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + + status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + + instrumented_in_executable(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + + dlclose(dso_handle); + + status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} diff --git a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp new file mode 100644 index 000000000000..89da2764c35c --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp @@ -0,0 +1,197 @@ +// Check that loading libraries with different modes (RTLD_LOCAL/RTLD_GLOBAL) +// and dependencies on other DSOs 
work correctly. +// + +// RUN: split-file %s %t +// +// Build shared libs with dependencies b->c and e->f +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testliba.cpp -o %t/testliba.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibc.cpp -o %t/testlibc.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibb.cpp %t/testlibc.so -o %t/testlibb.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibd.cpp -o %t/testlibd.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibf.cpp -o %t/testlibf.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibe.cpp %t/testlibf.so -o %t/testlibe.so +// +// Executable links with a and b explicitly and loads d and e at runtime. +// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testliba.so %t/testlibb.so -o %t/main.o +// +// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlibd.so %t/testlibe.so 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include +#include + +[[clang::xray_never_instrument]] void test_handler(int32_t fid, + XRayEntryType type) { + printf("called: %d, object=%d, fn=%d, type=%d\n", fid, (fid >> 24) & 0xFF, + fid & 0x00FFFFFF, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +typedef void (*dso_func_type)(); + +[[clang::xray_never_instrument]] void *load_dso(const char *path, int mode) { + void *dso_handle = dlopen(path, mode); + if (!dso_handle) { + printf("failed to load shared library\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + } + return nullptr; + } + return dso_handle; +} + 
+[[clang::xray_never_instrument]] void find_and_call(void *dso_handle, + const char *fn) { + dso_func_type dso_fn = (dso_func_type)dlsym(dso_handle, fn); + if (!dso_fn) { + printf("failed to find symbol\n"); + char *error = dlerror(); + if (error) { + fprintf(stderr, "%s\n", error); + } + return; + } + dso_fn(); +} + +extern void a(); +extern void b(); + +int main(int argc, char **argv) { + + if (argc < 3) { + printf("Shared library arguments missing\n"); + // CHECK-NOT: Shared library arguments missing + return 1; + } + + const char *dso_path_d = argv[1]; + const char *dso_path_e = argv[2]; + + __xray_set_handler(test_handler); + + instrumented_in_executable(); + // CHECK: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=1 + + a(); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: a called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1]], fn=1, type=1 + + // Make sure this object ID does not appear again + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ1]] + + b(); // b calls c + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: b called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: c called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ3]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ2]] + + // Now check explicit loading with RTLD_LOCAL + + void *dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_LOCAL); + void *dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_LOCAL); + // CHECK-NOT: failed to load shared library + + find_and_call(dso_handle_d, "_Z1dv"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ4:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: d called + // CHECK-NEXT: called: 
{{[0-9]+}}, object=[[OBJ4]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ4]] + + find_and_call(dso_handle_e, "_Z1ev"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: e called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: f called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ6]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ5]] + + // Unload DSOs + dlclose(dso_handle_d); + dlclose(dso_handle_e); + + // Repeat test with RTLD_GLOBAL + dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_GLOBAL); + dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_GLOBAL); + // CHECK-NOT: failed to load shared library + + find_and_call(dso_handle_d, "_Z1dv"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: d called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ7]] + + find_and_call(dso_handle_e, "_Z1ev"); + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: e called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9:[0-9]+]], fn=1, type=0 + // CHECK-NEXT: f called + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ9]] + // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8]], fn=1, type=1 + // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ8]] + + auto status = __xray_unpatch(); + printf("unpatching status: %d\n", static_cast(status)); + // CHECK-NEXT: unpatching status: 1 + + dlclose(dso_handle_d); + dlclose(dso_handle_e); +} + +//--- libgenmacro.inc +#include +// Helper macros to quickly generate libraries containing a single function. 
+#define GENERATE_LIB(NAME) \ + [[clang::xray_always_instrument]] void NAME() { printf(#NAME " called\n"); } + +#define GENERATE_LIB_WITH_CALL(NAME, FN) \ + extern void FN(); \ + [[clang::xray_always_instrument]] void NAME() { \ + printf(#NAME " called\n"); \ + FN(); \ + } + +//--- testliba.cpp +#include "libgenmacro.inc" +GENERATE_LIB(a) + +//--- testlibb.cpp +#include "libgenmacro.inc" +GENERATE_LIB_WITH_CALL(b, c) + +//--- testlibc.cpp +#include "libgenmacro.inc" +GENERATE_LIB(c) + +//--- testlibd.cpp +#include "libgenmacro.inc" +GENERATE_LIB(d) + +//--- testlibe.cpp +#include "libgenmacro.inc" +GENERATE_LIB_WITH_CALL(e, f) + +//--- testlibf.cpp +#include "libgenmacro.inc" +GENERATE_LIB(f) diff --git a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp new file mode 100644 index 000000000000..0708d0383439 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp @@ -0,0 +1,45 @@ +// Checking that DSOs are automatically patched upon load, if patch_premain is passed. 
+ +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=true,verbosity=1" %run %t/main.o 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +extern void instrumented_in_dso(); + +int main() { + __xray_set_handler(test_handler); + instrumented_in_executable(); + // CHECK: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} diff --git a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp new file mode 100644 index 000000000000..d3e992dd4977 --- /dev/null +++ b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp @@ -0,0 +1,75 @@ +// Check that we can patch and un-patch on demand, and that logging gets invoked +// appropriately. 
+// + +// RUN: split-file %s %t +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so +// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o + +// RUN: XRAY_OPTIONS="patch_premain=false" %run %t/main.o 2>&1 | FileCheck %s + +// REQUIRES: target=x86_64{{.*}} + +//--- main.cpp + +#include "xray/xray_interface.h" + +#include + +bool called = false; + +void test_handler(int32_t fid, XRayEntryType type) { + printf("called: %d, type=%d\n", fid, static_cast(type)); + called = true; +} + +[[clang::xray_always_instrument]] void instrumented_in_executable() { + printf("instrumented_in_executable called\n"); +} + +extern void instrumented_in_dso(); + +int main() { + __xray_set_handler(test_handler); + instrumented_in_executable(); + // CHECK: instrumented_in_executable called + instrumented_in_dso(); + // CHECK: instrumented_in_dso called + auto status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + instrumented_in_executable(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_executable called + // CHECK-NEXT: called: {{.*}}, type=1 + instrumented_in_dso(); + // CHECK-NEXT: called: {{.*}}, type=0 + // CHECK-NEXT: instrumented_in_dso called + // CHECK-NEXT: called: {{.*}}, type=1 + status = __xray_unpatch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + status = __xray_patch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 + __xray_remove_handler(); + instrumented_in_executable(); + // CHECK-NEXT: instrumented_in_executable called + instrumented_in_dso(); + // CHECK-NEXT: instrumented_in_dso called + status = 
__xray_unpatch(); + printf("patching status: %d\n", static_cast(status)); + // CHECK-NEXT: patching status: 1 +} + +//--- testlib.cpp + +#include + +[[clang::xray_always_instrument]] void instrumented_in_dso() { + printf("instrumented_in_dso called\n"); +} -- GitLab From 42ec740d0347a89b656c9be5ac4a7e4d8bcd30d5 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 11:36:55 +0200 Subject: [PATCH 012/345] [clang][ExprConstant] Remove an outdated TODO comment (#111959) Seems like passing the quantities directly seems to work fine. --- clang/lib/AST/ExprConstant.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 4d5af96093cf..06e653f96d6d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9768,11 +9768,8 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, if (BaseAlignment < Align) { Result.Designator.setInvalid(); - // FIXME: Add support to Diagnostic for long / long long. - CCEDiag(E->getArg(0), - diag::note_constexpr_baa_insufficient_alignment) << 0 - << (unsigned)BaseAlignment.getQuantity() - << (unsigned)Align.getQuantity(); + CCEDiag(E->getArg(0), diag::note_constexpr_baa_insufficient_alignment) + << 0 << BaseAlignment.getQuantity() << Align.getQuantity(); return false; } } @@ -9783,11 +9780,11 @@ bool PointerExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, (OffsetResult.Base ? 
CCEDiag(E->getArg(0), - diag::note_constexpr_baa_insufficient_alignment) << 1 + diag::note_constexpr_baa_insufficient_alignment) + << 1 : CCEDiag(E->getArg(0), diag::note_constexpr_baa_value_insufficient_alignment)) - << (int)OffsetResult.Offset.getQuantity() - << (unsigned)Align.getQuantity(); + << OffsetResult.Offset.getQuantity() << Align.getQuantity(); return false; } -- GitLab From 7b0d56be1d002e9cf0d8dda8ecaee99c5dbc88cf Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Fri, 11 Oct 2024 11:40:27 +0200 Subject: [PATCH 013/345] AMDGPU/GlobalISel: Fix inst-selection of ballot (#109986) Both input and output of ballot are lane-masks: result is lane-mask with 'S32/S64 LLT and SGPR bank' input is lane-mask with 'S1 LLT and VCC reg bank'. Ballot copies bits from input lane-mask for all active lanes and puts 0 for inactive lanes. GlobalISel did not set 0 in result for inactive lanes for non-constant input. --- llvm/docs/AMDGPUUsage.rst | 6 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 101 +++++++++++++----- .../GlobalISel/llvm.amdgcn.ballot.i32.ll | 90 +++++++++++++++- .../GlobalISel/llvm.amdgcn.ballot.i64.ll | 58 +++++++++- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll | 77 ++++++++++++- .../CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll | 47 ++++++++ .../AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll | 20 ++-- 8 files changed, 360 insertions(+), 41 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 6ff3272422fe..aba39762861d 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1369,6 +1369,12 @@ The AMDGPU backend implements the following LLVM IR intrinsics. sign-extended from the width of the underlying PC hardware register even on processors where the s_getpc_b64 instruction returns a zero-extended value. + llvm.amdgcn.ballot Returns a bitfield(i32 or i64) containing the result of its i1 argument + in all active lanes, and zero in all inactive lanes. 
+ Provides a way to convert i1 in LLVM IR to i32 or i64 lane mask - bitfield + used by hardware to control active lanes when used in EXEC register. + For example, ballot(i1 true) return EXEC mask. + ============================================== ========================================================== .. TODO:: diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 2738eb77b675..715f2cc917e2 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2086,6 +2086,8 @@ def int_amdgcn_fcmp : [IntrNoMem, IntrConvergent, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +// Returns a bitfield(i32 or i64) containing the result of its i1 argument +// in all active lanes, and zero in all inactive lanes. def int_amdgcn_ballot : Intrinsic<[llvm_anyint_ty], [llvm_i1_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5be0a049cc58..53628981e124 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1413,50 +1413,101 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const { return true; } +// Ballot has to zero bits in input lane-mask that are zero in current exec, +// Done as AND with exec. For inputs that are results of instruction that +// implicitly use same exec, for example compares in same basic block or SCC to +// VCC copy, use copy. +static bool isLaneMaskFromSameBlock(Register Reg, MachineRegisterInfo &MRI, + MachineBasicBlock *MBB) { + MachineInstr *MI = MRI.getVRegDef(Reg); + if (MI->getParent() != MBB) + return false; + + // Lane mask generated by SCC to VCC copy. 
+ if (MI->getOpcode() == AMDGPU::COPY) { + auto DstRB = MRI.getRegBankOrNull(MI->getOperand(0).getReg()); + auto SrcRB = MRI.getRegBankOrNull(MI->getOperand(1).getReg()); + if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID && + SrcRB->getID() == AMDGPU::SGPRRegBankID) + return true; + } + + // Lane mask generated using compare with same exec. + if (isa(MI)) + return true; + + Register LHS, RHS; + // Look through AND. + if (mi_match(Reg, MRI, m_GAnd(m_Reg(LHS), m_Reg(RHS)))) + return isLaneMaskFromSameBlock(LHS, MRI, MBB) || + isLaneMaskFromSameBlock(RHS, MRI, MBB); + + return false; +} + bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); Register DstReg = I.getOperand(0).getReg(); - const unsigned Size = MRI->getType(DstReg).getSizeInBits(); - const bool Is64 = Size == 64; - const bool IsWave32 = (STI.getWavefrontSize() == 32); + Register SrcReg = I.getOperand(2).getReg(); + const unsigned BallotSize = MRI->getType(DstReg).getSizeInBits(); + const unsigned WaveSize = STI.getWavefrontSize(); // In the common case, the return type matches the wave size. // However we also support emitting i64 ballots in wave32 mode. - if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32)) + if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32)) return false; std::optional Arg = - getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI); + getIConstantVRegValWithLookThrough(SrcReg, *MRI); + + Register Dst = DstReg; + // i64 ballot on Wave32: new Dst(i32) for WaveSize ballot. 
+ if (BallotSize != WaveSize) { + Dst = MRI->createVirtualRegister(TRI.getBoolRC()); + } - const auto BuildCopy = [&](Register SrcReg) { - if (Size == STI.getWavefrontSize()) { - BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg) - .addReg(SrcReg); - return; + if (Arg) { + const int64_t Value = Arg->Value.getZExtValue(); + if (Value == 0) { + // Dst = S_MOV 0 + unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; + BuildMI(*BB, &I, DL, TII.get(Opcode), Dst).addImm(0); + } else { + // Dst = COPY EXEC + assert(Value == 1); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec()); } + if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) + return false; + } else { + if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) { + // Dst = COPY SrcReg + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg); + if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI)) + return false; + } else { + // Dst = S_AND SrcReg, EXEC + unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32; + auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), Dst) + .addReg(SrcReg) + .addReg(TRI.getExec()) + .setOperandDead(3); // Dead scc + if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI)) + return false; + } + } - // If emitting a i64 ballot in wave32, fill the upper bits with zeroes. + // i64 ballot on Wave32: zero-extend i32 ballot to i64. + if (BallotSize != WaveSize) { Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass); BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0); BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg) - .addReg(SrcReg) + .addReg(Dst) .addImm(AMDGPU::sub0) .addReg(HiReg) .addImm(AMDGPU::sub1); - }; - - if (Arg) { - const int64_t Value = Arg->Value.getSExtValue(); - if (Value == 0) { - unsigned Opcode = Is64 ? 
AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32; - BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); - } else if (Value == -1) // all ones - BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC); - else - return false; - } else - BuildCopy(I.getOperand(2).getReg()); + } I.eraseFromParent(); return true; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll index 96cab200b61c..2edcf23df411 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,GFX11 %s declare i32 @llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) @@ -33,7 +33,8 @@ define amdgpu_cs i32 @non_compare(i32 %x) { ; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo ; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc) @@ -89,7 +90,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_cmp_eq_u32 vcc_lo, 0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_eq_u32 s0, 0 
; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -137,7 +139,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; CHECK-NEXT: s_cmp_lg_u32 vcc_lo, 0 +; CHECK-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; CHECK-NEXT: s_cmp_lg_u32 s0, 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -419,3 +422,80 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; GFX10-LABEL: non_cst_non_compare_input: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: ; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: ; %bb.3: ; %A +; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.4: ; %exit +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: s_and_b32 s0, s0, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: non_cst_non_compare_input: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, 
v3 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11-NEXT: ; %bb.1: ; %B +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX11-NEXT: ; %bb.3: ; %A +; GFX11-NEXT: v_cmp_le_u32_e32 vcc_lo, 1, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: s_and_b32 s2, exec_lo, vcc_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.4: ; %exit +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi) + store i32 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll index a18f84344044..0bbb40b8db43 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -34,7 +34,8 @@ define amdgpu_cs i64 @non_compare(i32 %x) { ; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec ; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) @@ -92,7 +93,8 @@ define amdgpu_cs i32 
@branch_divergent_ballot_ne_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cmp_eq_u64 vcc, 0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc1 .LBB7_2 ; CHECK-NEXT: ; %bb.1: ; %true ; CHECK-NEXT: s_mov_b32 s0, 42 @@ -140,7 +142,8 @@ define amdgpu_cs i32 @branch_divergent_ballot_eq_zero_non_compare(i32 %v) { ; CHECK: ; %bb.0: ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 -; CHECK-NEXT: s_cmp_lg_u64 vcc, 0 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: s_cmp_lg_u64 s[0:1], 0 ; CHECK-NEXT: s_cbranch_scc0 .LBB9_2 ; CHECK-NEXT: ; %bb.1: ; %false ; CHECK-NEXT: s_mov_b32 s0, 33 @@ -422,3 +425,52 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; CHECK-LABEL: non_cst_non_compare_input: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_and_b32 s0, 1, s0 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, s0 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: ; %bb.1: ; %B +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: ; %bb.2: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; CHECK-NEXT: ; %bb.3: ; %A +; CHECK-NEXT: v_cmp_le_u32_e32 vcc, 1, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], exec, vcc +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; %bb.4: ; %exit +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_and_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: v_mov_b32_e32 
v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; CHECK-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi) + store i64 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll index 047b35b8c0f9..026a8d7da708 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX10 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32 < %s | FileCheck -check-prefixes=CHECK,GFX11 %s declare i32 @llvm.amdgcn.ballot.i32(i1) declare i32 @llvm.ctpop.i32(i32) @@ -522,3 +522,76 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. 
+define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; GFX10-LABEL: non_cst_non_compare_input: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: ; implicit-def: $sgpr0 +; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX10-NEXT: ; %bb.1: ; %B +; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX10-NEXT: ; implicit-def: $vgpr2 +; GFX10-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX10-NEXT: ; %bb.2: ; %Flow +; GFX10-NEXT: s_andn2_saveexec_b32 s1, s1 +; GFX10-NEXT: ; %bb.3: ; %A +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: s_andn2_b32 s0, s0, exec_lo +; GFX10-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX10-NEXT: s_or_b32 s0, s0, s2 +; GFX10-NEXT: ; %bb.4: ; %exit +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: non_cst_non_compare_input: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s1, exec_lo +; GFX11-NEXT: ; implicit-def: $sgpr0 +; GFX11-NEXT: v_cmpx_ne_u32_e32 0, v3 +; GFX11-NEXT: s_xor_b32 s1, exec_lo, s1 +; GFX11-NEXT: ; %bb.1: ; %B +; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v2 +; GFX11-NEXT: ; implicit-def: $vgpr2 +; GFX11-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GFX11-NEXT: ; %bb.2: ; %Flow +; GFX11-NEXT: s_and_not1_saveexec_b32 s1, s1 +; GFX11-NEXT: ; %bb.3: ; %A +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: s_and_not1_b32 s0, s0, exec_lo +; GFX11-NEXT: s_and_b32 s2, vcc_lo, exec_lo +; GFX11-NEXT: s_or_b32 s0, s0, s2 +; GFX11-NEXT: ; %bb.4: ; %exit +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_nop 0 +; GFX11-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %phi) + store i32 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll index 61f0f20f0570..c7597e98a6d5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -511,3 +511,50 @@ true: false: ret i32 33 } + +; Input that is not constant or direct result of a compare. +; Tests setting 0 to inactive lanes. +define amdgpu_ps void @non_cst_non_compare_input(ptr addrspace(1) %out, i32 %tid, i32 %cond) { +; CHECK-LABEL: non_cst_non_compare_input: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3 +; CHECK-NEXT: ; implicit-def: $sgpr0_sgpr1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; CHECK-NEXT: ; %bb.1: ; %B +; CHECK-NEXT: v_cmp_gt_u32_e32 vcc, 2, v2 +; CHECK-NEXT: s_and_b64 s[0:1], vcc, exec +; CHECK-NEXT: ; implicit-def: $vgpr2 +; CHECK-NEXT: ; %bb.2: ; %Flow +; CHECK-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; CHECK-NEXT: ; %bb.3: ; %A +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2 +; CHECK-NEXT: s_andn2_b64 s[0:1], s[0:1], exec +; CHECK-NEXT: s_and_b64 s[4:5], vcc, exec +; CHECK-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] +; CHECK-NEXT: ; %bb.4: ; %exit +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v2 +; CHECK-NEXT: v_mov_b32_e32 v3, s1 +; CHECK-NEXT: v_mov_b32_e32 v2, s0 +; CHECK-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; CHECK-NEXT: s_endpgm +entry: + %cmp = icmp eq i32 %cond, 0 + br i1 %cmp, 
label %A, label %B + +A: + %val_A = icmp uge i32 %tid, 1 + br label %exit + +B: + %val_B = icmp ult i32 %tid, 2 + br label %exit + +exit: + %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ] + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %phi) + store i64 %ballot, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll index 5dbfdf24ef36..fe69dc490624 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll @@ -40,12 +40,20 @@ define amdgpu_cs i64 @constant_true() { ; Test ballot of a non-comparison operation define amdgpu_cs i64 @non_compare(i32 %x) { -; CHECK-LABEL: non_compare: -; CHECK: ; %bb.0: -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: s_mov_b32 s1, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 -; CHECK-NEXT: ; return to shader part epilog +; DAGISEL-LABEL: non_compare: +; DAGISEL: ; %bb.0: +; DAGISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; DAGISEL-NEXT: s_mov_b32 s1, 0 +; DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; DAGISEL-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: non_compare: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_and_b32_e32 v0, 1, v0 +; GISEL-NEXT: s_mov_b32 s1, 0 +; GISEL-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GISEL-NEXT: s_and_b32 s0, vcc_lo, exec_lo +; GISEL-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) ret i64 %ballot -- GitLab From 777142937a599d8a9cea5964b415d9cd13016d79 Mon Sep 17 00:00:00 2001 From: Simon Camphausen Date: Fri, 11 Oct 2024 11:45:25 +0200 Subject: [PATCH 014/345] [mlir][EmitC] Fail on memrefs with 0 dims in type conversion (#111965) This let's the type conversion fail instead of generating invalid array types. 
--- .../Conversion/MemRefToEmitC/MemRefToEmitC.cpp | 4 +++- .../MemRefToEmitC/memref-to-emitc-failed.mlir | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp index 2b7ac4b529cf..39532d34f616 100644 --- a/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp +++ b/mlir/lib/Conversion/MemRefToEmitC/MemRefToEmitC.cpp @@ -167,7 +167,9 @@ void mlir::populateMemRefToEmitCTypeConversion(TypeConverter &typeConverter) { typeConverter.addConversion( [&](MemRefType memRefType) -> std::optional { if (!memRefType.hasStaticShape() || - !memRefType.getLayout().isIdentity() || memRefType.getRank() == 0) { + !memRefType.getLayout().isIdentity() || memRefType.getRank() == 0 || + llvm::any_of(memRefType.getShape(), + [](int64_t dim) { return dim == 0; })) { return {}; } Type convertedElementType = diff --git a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir index dee9cc97a144..fda01974d3fc 100644 --- a/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir +++ b/mlir/test/Conversion/MemRefToEmitC/memref-to-emitc-failed.mlir @@ -41,6 +41,22 @@ func.func @zero_rank() { // ----- +func.func @zero_dim_rank_1() { + // expected-error@+1 {{failed to legalize operation 'memref.alloca'}} + %0 = memref.alloca() : memref<0xf32> + return +} + +// ----- + +func.func @zero_dim_rank_3() { + // expected-error@+1 {{failed to legalize operation 'memref.alloca'}} + %0 = memref.alloca() : memref<2x0x4xf32> + return +} + +// ----- + // expected-error@+1 {{failed to legalize operation 'memref.global'}} memref.global "nested" constant @nested_global : memref<3x7xf32> -- GitLab From 80c15c48d1fbb53478c9400e598abcbdcae0d962 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 11 Oct 2024 11:46:33 +0200 Subject: [PATCH 015/345] [clang][bytecode] Implement __builtin_assume_aligned (#111968) 
--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 71 ++++++++++++++++++- clang/lib/AST/ExprConstShared.h | 8 +++ clang/lib/AST/ExprConstant.cpp | 35 +++++---- clang/test/Sema/builtin-assume-aligned.c | 2 + clang/test/SemaCXX/builtin-assume-aligned.cpp | 1 + 5 files changed, 98 insertions(+), 19 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 74e9e1cf6293..ec27aebf84bd 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -38,7 +38,6 @@ static T getParam(const InterpFrame *Frame, unsigned Index) { return Frame->getParam(Offset); } -// static APSInt getAPSIntParam(InterpStack &Stk, size_t Offset = 0) { static APSInt getAPSIntParam(const InterpFrame *Frame, unsigned Index) { APSInt R; unsigned Offset = Frame->getFunction()->getParamOffset(Index); @@ -1162,6 +1161,71 @@ static bool interp__builtin_is_aligned_up_down(InterpState &S, CodePtr OpPC, return false; } +/// __builtin_assume_aligned(Ptr, Alignment[, ExtraOffset]) +static bool interp__builtin_assume_aligned(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const Function *Func, + const CallExpr *Call) { + assert(Call->getNumArgs() == 2 || Call->getNumArgs() == 3); + + // Might be called with function pointers in C. 
+ std::optional PtrT = S.Ctx.classify(Call->getArg(0)); + if (PtrT != PT_Ptr) + return false; + + unsigned ArgSize = callArgSize(S, Call); + const Pointer &Ptr = S.Stk.peek(ArgSize); + std::optional ExtraOffset; + APSInt Alignment; + if (Call->getNumArgs() == 2) { + Alignment = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(1))); + } else { + PrimType AlignmentT = *S.Ctx.classify(Call->getArg(1)); + PrimType ExtraOffsetT = *S.Ctx.classify(Call->getArg(2)); + Alignment = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(1)), + align(primSize(AlignmentT)) + + align(primSize(ExtraOffsetT))); + ExtraOffset = peekToAPSInt(S.Stk, *S.Ctx.classify(Call->getArg(2))); + } + + CharUnits Align = CharUnits::fromQuantity(Alignment.getZExtValue()); + + // If there is a base object, then it must have the correct alignment. + if (Ptr.isBlockPointer()) { + CharUnits BaseAlignment; + if (const auto *VD = Ptr.getDeclDesc()->asValueDecl()) + BaseAlignment = S.getASTContext().getDeclAlign(VD); + else if (const auto *E = Ptr.getDeclDesc()->asExpr()) + BaseAlignment = GetAlignOfExpr(S.getASTContext(), E, UETT_AlignOf); + + if (BaseAlignment < Align) { + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_insufficient_alignment) + << 0 << BaseAlignment.getQuantity() << Align.getQuantity(); + return false; + } + } + + APValue AV = Ptr.toAPValue(S.getASTContext()); + CharUnits AVOffset = AV.getLValueOffset(); + if (ExtraOffset) + AVOffset -= CharUnits::fromQuantity(ExtraOffset->getZExtValue()); + if (AVOffset.alignTo(Align) != AVOffset) { + if (Ptr.isBlockPointer()) + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_insufficient_alignment) + << 1 << AVOffset.getQuantity() << Align.getQuantity(); + else + S.CCEDiag(Call->getArg(0), + diag::note_constexpr_baa_value_insufficient_alignment) + << AVOffset.getQuantity() << Align.getQuantity(); + return false; + } + + S.Stk.push(Ptr); + return true; +} + static bool interp__builtin_ia32_bextr(InterpState &S, CodePtr OpPC, const InterpFrame 
*Frame, const Function *Func, @@ -1905,6 +1969,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const Function *F, return false; break; + case Builtin::BI__builtin_assume_aligned: + if (!interp__builtin_assume_aligned(S, OpPC, Frame, F, Call)) + return false; + break; + case clang::X86::BI__builtin_ia32_bextr_u32: case clang::X86::BI__builtin_ia32_bextr_u64: case clang::X86::BI__builtin_ia32_bextri_u32: diff --git a/clang/lib/AST/ExprConstShared.h b/clang/lib/AST/ExprConstShared.h index efe8ee986d29..401ae629c86b 100644 --- a/clang/lib/AST/ExprConstShared.h +++ b/clang/lib/AST/ExprConstShared.h @@ -14,12 +14,17 @@ #ifndef LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H #define LLVM_CLANG_LIB_AST_EXPRCONSTSHARED_H +#include "clang/Basic/TypeTraits.h" + namespace llvm { class APFloat; } namespace clang { class QualType; class LangOptions; +class ASTContext; +class CharUnits; +class Expr; } // namespace clang using namespace clang; /// Values returned by __builtin_classify_type, chosen to match the values @@ -66,4 +71,7 @@ void HandleComplexComplexDiv(llvm::APFloat A, llvm::APFloat B, llvm::APFloat C, llvm::APFloat D, llvm::APFloat &ResR, llvm::APFloat &ResI); +CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E, + UnaryExprOrTypeTrait ExprKind); + #endif diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 06e653f96d6d..70b223596d8b 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -9620,7 +9620,7 @@ bool PointerExprEvaluator::VisitCastExpr(const CastExpr *E) { return ExprEvaluatorBaseTy::VisitCastExpr(E); } -static CharUnits GetAlignOfType(EvalInfo &Info, QualType T, +static CharUnits GetAlignOfType(const ASTContext &Ctx, QualType T, UnaryExprOrTypeTrait ExprKind) { // C++ [expr.alignof]p3: // When alignof is applied to a reference type, the result is the @@ -9631,23 +9631,22 @@ static CharUnits GetAlignOfType(EvalInfo &Info, QualType T, return CharUnits::One(); const bool 
AlignOfReturnsPreferred = - Info.Ctx.getLangOpts().getClangABICompat() <= LangOptions::ClangABI::Ver7; + Ctx.getLangOpts().getClangABICompat() <= LangOptions::ClangABI::Ver7; // __alignof is defined to return the preferred alignment. // Before 8, clang returned the preferred alignment for alignof and _Alignof // as well. if (ExprKind == UETT_PreferredAlignOf || AlignOfReturnsPreferred) - return Info.Ctx.toCharUnitsFromBits( - Info.Ctx.getPreferredTypeAlign(T.getTypePtr())); + return Ctx.toCharUnitsFromBits(Ctx.getPreferredTypeAlign(T.getTypePtr())); // alignof and _Alignof are defined to return the ABI alignment. else if (ExprKind == UETT_AlignOf) - return Info.Ctx.getTypeAlignInChars(T.getTypePtr()); + return Ctx.getTypeAlignInChars(T.getTypePtr()); else llvm_unreachable("GetAlignOfType on a non-alignment ExprKind"); } -static CharUnits GetAlignOfExpr(EvalInfo &Info, const Expr *E, - UnaryExprOrTypeTrait ExprKind) { +CharUnits GetAlignOfExpr(const ASTContext &Ctx, const Expr *E, + UnaryExprOrTypeTrait ExprKind) { E = E->IgnoreParens(); // The kinds of expressions that we have special-case logic here for @@ -9657,22 +9656,22 @@ static CharUnits GetAlignOfExpr(EvalInfo &Info, const Expr *E, // alignof decl is always accepted, even if it doesn't make sense: we default // to 1 in those cases. 
if (const DeclRefExpr *DRE = dyn_cast(E)) - return Info.Ctx.getDeclAlign(DRE->getDecl(), - /*RefAsPointee*/true); + return Ctx.getDeclAlign(DRE->getDecl(), + /*RefAsPointee*/ true); if (const MemberExpr *ME = dyn_cast(E)) - return Info.Ctx.getDeclAlign(ME->getMemberDecl(), - /*RefAsPointee*/true); + return Ctx.getDeclAlign(ME->getMemberDecl(), + /*RefAsPointee*/ true); - return GetAlignOfType(Info, E->getType(), ExprKind); + return GetAlignOfType(Ctx, E->getType(), ExprKind); } static CharUnits getBaseAlignment(EvalInfo &Info, const LValue &Value) { if (const auto *VD = Value.Base.dyn_cast()) return Info.Ctx.getDeclAlign(VD); if (const auto *E = Value.Base.dyn_cast()) - return GetAlignOfExpr(Info, E, UETT_AlignOf); - return GetAlignOfType(Info, Value.Base.getTypeInfoType(), UETT_AlignOf); + return GetAlignOfExpr(Info.Ctx, E, UETT_AlignOf); + return GetAlignOfType(Info.Ctx, Value.Base.getTypeInfoType(), UETT_AlignOf); } /// Evaluate the value of the alignment argument to __builtin_align_{up,down}, @@ -14475,11 +14474,11 @@ bool IntExprEvaluator::VisitUnaryExprOrTypeTraitExpr( case UETT_PreferredAlignOf: case UETT_AlignOf: { if (E->isArgumentType()) - return Success(GetAlignOfType(Info, E->getArgumentType(), E->getKind()), - E); + return Success( + GetAlignOfType(Info.Ctx, E->getArgumentType(), E->getKind()), E); else - return Success(GetAlignOfExpr(Info, E->getArgumentExpr(), E->getKind()), - E); + return Success( + GetAlignOfExpr(Info.Ctx, E->getArgumentExpr(), E->getKind()), E); } case UETT_PtrAuthTypeDiscriminator: { diff --git a/clang/test/Sema/builtin-assume-aligned.c b/clang/test/Sema/builtin-assume-aligned.c index c2e4f9d659dd..33e855784515 100644 --- a/clang/test/Sema/builtin-assume-aligned.c +++ b/clang/test/Sema/builtin-assume-aligned.c @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -DSIZE_T_64 -fsyntax-only -Wno-strict-prototypes -triple x86_64-linux -verify %s // RUN: %clang_cc1 -fsyntax-only -Wno-strict-prototypes -triple i386-freebsd -verify %s +// RUN: %clang_cc1 
-DSIZE_T_64 -fsyntax-only -Wno-strict-prototypes -triple x86_64-linux -verify %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -fsyntax-only -Wno-strict-prototypes -triple i386-freebsd -verify %s -fexperimental-new-constant-interpreter // __builtin_assume_aligned's second parameter is size_t, which may be 32 bits, // so test differently when size_t is 32 bits and when it is 64 bits. diff --git a/clang/test/SemaCXX/builtin-assume-aligned.cpp b/clang/test/SemaCXX/builtin-assume-aligned.cpp index 48bd8414fc50..85a7faee9161 100644 --- a/clang/test/SemaCXX/builtin-assume-aligned.cpp +++ b/clang/test/SemaCXX/builtin-assume-aligned.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -triple x86_64-linux-gnu %s +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 -triple x86_64-linux-gnu %s -fexperimental-new-constant-interpreter int n; constexpr int *p = 0; -- GitLab From 73ad416ebf9d11b876f22ede0ee90f660192869f Mon Sep 17 00:00:00 2001 From: Dominik Adamski Date: Fri, 11 Oct 2024 11:53:28 +0200 Subject: [PATCH 016/345] [OpenMP][Flang] Enable alias analysis inside omp target region (#111670) At present, alias analysis does not work for operations inside OMP target regions because the FIR declare operations within OMP target do not offer sufficient information for alias analysis. Consequently, it is necessary to examine the FIR code outside the OMP target region. 
--- .../lib/Optimizer/Analysis/AliasAnalysis.cpp | 29 ++++++ flang/lib/Optimizer/Analysis/CMakeLists.txt | 2 + .../alias-analysis-omp-target-1.fir | 66 +++++++++++++ .../alias-analysis-omp-target-2.fir | 96 +++++++++++++++++++ 4 files changed, 193 insertions(+) create mode 100644 flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir create mode 100644 flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index e88da5a8ebae..6ee4f0ff7105 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -13,6 +13,8 @@ #include "flang/Optimizer/Dialect/FortranVariableInterface.h" #include "flang/Optimizer/HLFIR/HLFIROps.h" #include "mlir/Analysis/AliasAnalysis.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/Dialect/OpenMP/OpenMPInterfaces.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/Value.h" #include "mlir/Interfaces/SideEffectInterfaces.h" @@ -296,6 +298,17 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, defOp = v.getDefiningOp(); return; } + // If load is inside target and it points to mapped item, + // continue tracking. + Operation *loadMemrefOp = op.getMemref().getDefiningOp(); + bool isDeclareOp = llvm::isa(loadMemrefOp) || + llvm::isa(loadMemrefOp); + if (isDeclareOp && + llvm::isa(loadMemrefOp->getParentOp())) { + v = op.getMemref(); + defOp = v.getDefiningOp(); + return; + } // No further tracking for addresses loaded from memory for now. 
type = SourceKind::Indirect; breakFromLoop = true; @@ -319,6 +332,22 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, breakFromLoop = true; }) .Case([&](auto op) { + // If declare operation is inside omp target region, + // continue alias analysis outside the target region + if (auto targetOp = + llvm::dyn_cast(op->getParentOp())) { + auto argIface = cast(*targetOp); + for (auto [opArg, blockArg] : llvm::zip_equal( + targetOp.getMapVars(), argIface.getMapBlockArgs())) { + if (blockArg == op.getMemref()) { + omp::MapInfoOp mapInfo = + llvm::cast(opArg.getDefiningOp()); + v = mapInfo.getVarPtr(); + defOp = v.getDefiningOp(); + return; + } + } + } auto varIf = llvm::cast(defOp); // While going through a declare operation collect // the variable attributes from it. Right now, some diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt index 436d4d3f1896..c000a9da99f8 100644 --- a/flang/lib/Optimizer/Analysis/CMakeLists.txt +++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt @@ -6,6 +6,7 @@ add_flang_library(FIRAnalysis FIRDialect HLFIRDialect MLIRIR + MLIROpenMPDialect LINK_LIBS FIRBuilder @@ -14,5 +15,6 @@ add_flang_library(FIRAnalysis MLIRFuncDialect MLIRLLVMDialect MLIRMathTransforms + MLIROpenMPDialect FIRSupport ) diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir new file mode 100644 index 000000000000..88f411847172 --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-1.fir @@ -0,0 +1,66 @@ +// Use --mlir-disable-threading so that the AA queries are serialized +// as well as its diagnostic output. 
+// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s + +// Fortran source code: +// +// program TestAllocatableArray +// real(kind=8), allocatable :: A(:) +// real(kind=8), allocatable :: B(:) +// !$omp target +// A(0) = B(0) +// !$omp end target +// end TestAllocatableArray + +// CHECK-LABEL: Testing : "_QPTestAllocatableArray" +// CHECK-DAG: targetArrayB#0 <-> targetArrayA#0: NoAlias +func.func @_QPTestAllocatableArray() { + %0 = fir.address_of(@_QFEa) : !fir.ref>>> + %1:2 = hlfir.declare %0 {fortran_attrs = #fir.var_attrs, uniq_name = "ArrayA" } : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %2 = fir.address_of(@_QFEb) : !fir.ref>>> + %3:2 = hlfir.declare %2 {fortran_attrs = #fir.var_attrs, uniq_name = "ArrayB" } : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %4 = fir.load %1#0 : !fir.ref>>> + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %5 = fir.load %1#1 : !fir.ref>>> + %c0_0 = arith.constant 0 : index + %6:3 = fir.box_dims %5, %c0_0 : (!fir.box>>, index) -> (index, index, index) + %7:3 = fir.box_dims %4, %c0 : (!fir.box>>, index) -> (index, index, index) + %c0_1 = arith.constant 0 : index + %8 = arith.subi %7#1, %c1 : index + %9 = omp.map.bounds lower_bound(%c0_1 : index) upper_bound(%8 : index) extent(%7#1 : index) stride(%7#2 : index) start_idx(%6#0 : index) {stride_in_bytes = true} + %10 = fir.box_offset %1#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> + %11 = omp.map.info var_ptr(%1#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%10 : !fir.llvm_ptr>>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%9) -> !fir.llvm_ptr>> {name = ""} + %12 = omp.map.info var_ptr(%1#1 : !fir.ref>>>, !fir.box>>) map_clauses(implicit, tofrom) capture(ByRef) members(%11 : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "a"} + %13 = fir.load %3#0 : !fir.ref>>> + %c1_2 = arith.constant 1 : index + %c0_3 = arith.constant 0 : index + %14 = fir.load %3#1 : 
!fir.ref>>> + %c0_4 = arith.constant 0 : index + %15:3 = fir.box_dims %14, %c0_4 : (!fir.box>>, index) -> (index, index, index) + %16:3 = fir.box_dims %13, %c0_3 : (!fir.box>>, index) -> (index, index, index) + %c0_5 = arith.constant 0 : index + %17 = arith.subi %16#1, %c1_2 : index + %18 = omp.map.bounds lower_bound(%c0_5 : index) upper_bound(%17 : index) extent(%16#1 : index) stride(%16#2 : index) start_idx(%15#0 : index) {stride_in_bytes = true} + %19 = fir.box_offset %3#1 base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> + %20 = omp.map.info var_ptr(%3#1 : !fir.ref>>>, !fir.array) var_ptr_ptr(%19 : !fir.llvm_ptr>>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%18) -> !fir.llvm_ptr>> {name = ""} + %21 = omp.map.info var_ptr(%3#1 : !fir.ref>>>, !fir.box>>) map_clauses(implicit, tofrom) capture(ByRef) members(%20 : [0] : !fir.llvm_ptr>>) -> !fir.ref>>> {name = "b"} + omp.target map_entries(%11 -> %arg0, %12 -> %arg1, %20 -> %arg2, %21 -> %arg3 : !fir.llvm_ptr>>, !fir.ref>>>, !fir.llvm_ptr>>, !fir.ref>>>) { + %22:2 = hlfir.declare %arg1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEa"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %23:2 = hlfir.declare %arg3 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEb"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) + %24 = fir.load %23#0 : !fir.ref>>> + %c0_6 = arith.constant 0 : index + %25 = hlfir.designate %24 (%c0_6) {test.ptr = "targetArrayB"} : (!fir.box>>, index) -> !fir.ref + %26 = fir.load %25 : !fir.ref + %27 = fir.load %22#0 : !fir.ref>>> + %c0_7 = arith.constant 0 : index + %28 = hlfir.designate %27 (%c0_7) {test.ptr = "targetArrayA"} : (!fir.box>>, index) -> !fir.ref + hlfir.assign %26 to %28 : f64, !fir.ref + omp.terminator + } + return +} +fir.global internal @_QFEa : !fir.box>> { +} +fir.global internal @_QFEb : !fir.box>> { +} diff --git a/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir new file mode 100644 index 
000000000000..c6b2e29a7188 --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/alias-analysis-omp-target-2.fir @@ -0,0 +1,96 @@ +// Use --mlir-disable-threading so that the AA queries are serialized +// as well as its diagnostic output. +// RUN: fir-opt %s -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' -split-input-file --mlir-disable-threading 2>&1 | FileCheck %s + +// Fortran source code: +// +// subroutine TestTargetData(p, a, b) +// real :: p(10), a(10), b(10) +// !$omp target data map(from: p) +// !$omp target map(to: a ) +// p(1) = a(1) +// !$omp end target +// !$omp target map(to: b ) +// p(1) = b(1) +// !$omp end target +// !$omp end target data +// end subroutine + +// CHECK-LABEL: Testing : "_QPTestTargetData" + +// CHECK-DAG: targetArrayA#0 <-> targetArrayP#0: NoAlias +// CHECK-DAG: targetArrayA#0 <-> targetArrayB#0: NoAlias +// CHECK-DAG: targetArrayP#0 <-> targetArrayB#0: NoAlias + +func.func @_QPTestTargetData(%arg0: !fir.ref> {fir.bindc_name = "p"}, %arg1: !fir.ref> {fir.bindc_name = "a"}, %arg2: !fir.ref> {fir.bindc_name = "b"}) { + %0 = fir.dummy_scope : !fir.dscope + %c10 = arith.constant 10 : index + %1 = fir.shape %c10 : (index) -> !fir.shape<1> + %2:2 = hlfir.declare %arg1(%1) dummy_scope %0 {uniq_name = "_QFtest_target_dataEa"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c10_0 = arith.constant 10 : index + %3 = fir.shape %c10_0 : (index) -> !fir.shape<1> + %4:2 = hlfir.declare %arg2(%3) dummy_scope %0 {uniq_name = "_QFtest_target_dataEb"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c10_1 = arith.constant 10 : index + %5 = fir.shape %c10_1 : (index) -> !fir.shape<1> + %6:2 = hlfir.declare %arg0(%5) dummy_scope %0 {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>, !fir.dscope) -> (!fir.ref>, !fir.ref>) + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %7 = arith.subi %c10_1, %c1 : index + %8 = omp.map.bounds lower_bound(%c0 : index) 
upper_bound(%7 : index) extent(%c10_1 : index) stride(%c1 : index) start_idx(%c1 : index) + %9 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(from) capture(ByRef) bounds(%8) -> !fir.ref> {name = "p"} + omp.target_data map_entries(%9 : !fir.ref>) { + %c1_2 = arith.constant 1 : index + %c0_3 = arith.constant 0 : index + %10 = arith.subi %c10, %c1_2 : index + %11 = omp.map.bounds lower_bound(%c0_3 : index) upper_bound(%10 : index) extent(%c10 : index) stride(%c1_2 : index) start_idx(%c1_2 : index) + %12 = omp.map.info var_ptr(%2#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(to) capture(ByRef) bounds(%11) -> !fir.ref> {name = "a"} + %c1_4 = arith.constant 1 : index + %c0_5 = arith.constant 0 : index + %13 = arith.subi %c10_1, %c1_4 : index + %14 = omp.map.bounds lower_bound(%c0_5 : index) upper_bound(%13 : index) extent(%c10_1 : index) stride(%c1_4 : index) start_idx(%c1_4 : index) + %15 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%14) -> !fir.ref> {name = "p"} + omp.target map_entries(%12 -> %arg3, %15 -> %arg4 : !fir.ref>, !fir.ref>) { + %c10_10 = arith.constant 10 : index + %22 = fir.shape %c10_10 : (index) -> !fir.shape<1> + %23:2 = hlfir.declare %arg3(%22) {uniq_name = "_QFtest_target_dataEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c10_11 = arith.constant 10 : index + %24 = fir.shape %c10_11 : (index) -> !fir.shape<1> + %25:2 = hlfir.declare %arg4(%24) {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c1_12 = arith.constant 1 : index + %26 = hlfir.designate %23#0 (%c1_12) {test.ptr = "targetArrayA"} : (!fir.ref>, index) -> !fir.ref + %27 = fir.load %26 : !fir.ref + %c1_13 = arith.constant 1 : index + %28 = hlfir.designate %25#0 (%c1_13) {test.ptr = "targetArrayP"} : (!fir.ref>, index) -> !fir.ref + hlfir.assign %27 to %28 : f32, !fir.ref + omp.terminator + } + %c1_6 = arith.constant 1 : index + %c0_7 = 
arith.constant 0 : index + %16 = arith.subi %c10_0, %c1_6 : index + %17 = omp.map.bounds lower_bound(%c0_7 : index) upper_bound(%16 : index) extent(%c10_0 : index) stride(%c1_6 : index) start_idx(%c1_6 : index) + %18 = omp.map.info var_ptr(%4#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(to) capture(ByRef) bounds(%17) -> !fir.ref> {name = "b"} + %c1_8 = arith.constant 1 : index + %c0_9 = arith.constant 0 : index + %19 = arith.subi %c10_1, %c1_8 : index + %20 = omp.map.bounds lower_bound(%c0_9 : index) upper_bound(%19 : index) extent(%c10_1 : index) stride(%c1_8 : index) start_idx(%c1_8 : index) + %21 = omp.map.info var_ptr(%6#1 : !fir.ref>, !fir.array<10xf32>) map_clauses(implicit, tofrom) capture(ByRef) bounds(%20) -> !fir.ref> {name = "p"} + omp.target map_entries(%18 -> %arg3, %21 -> %arg4 : !fir.ref>, !fir.ref>) { + %c10_10 = arith.constant 10 : index + %22 = fir.shape %c10_10 : (index) -> !fir.shape<1> + %23:2 = hlfir.declare %arg3(%22) {uniq_name = "_QFtest_target_dataEb"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c10_11 = arith.constant 10 : index + %24 = fir.shape %c10_11 : (index) -> !fir.shape<1> + %25:2 = hlfir.declare %arg4(%24) {uniq_name = "_QFtest_target_dataEp"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c1_12 = arith.constant 1 : index + %26 = hlfir.designate %23#0 (%c1_12) {test.ptr = "targetArrayB"} : (!fir.ref>, index) -> !fir.ref + %27 = fir.load %26 : !fir.ref + %c1_13 = arith.constant 1 : index + %28 = hlfir.designate %25#0 (%c1_13) {test.ptr = "targetArrayP"} : (!fir.ref>, index) -> !fir.ref + hlfir.assign %27 to %28 : f32, !fir.ref + omp.terminator + } + omp.terminator + } + return +} + -- GitLab From f74f568b29885c3fa63c44e33f91f3bb7281138e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Fri, 11 Oct 2024 11:58:14 +0200 Subject: [PATCH 017/345] [clang][analyzer] PointerSubChecker should not warn on pointers converted to numerical type (#111846) Pointer values casted to integer 
(non-pointer) type should be able to be subtracted as usual. --- .../StaticAnalyzer/Checkers/PointerSubChecker.cpp | 4 ++++ clang/test/Analysis/pointer-sub.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp index f0dc5efd75f7..7a85d9e20730 100644 --- a/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/PointerSubChecker.cpp @@ -61,6 +61,10 @@ void PointerSubChecker::checkPreStmt(const BinaryOperator *B, if (LR->getSymbolicBase() || RR->getSymbolicBase()) return; + if (!B->getLHS()->getType()->isPointerType() || + !B->getRHS()->getType()->isPointerType()) + return; + const auto *ElemLR = dyn_cast(LR); const auto *ElemRR = dyn_cast(RR); diff --git a/clang/test/Analysis/pointer-sub.c b/clang/test/Analysis/pointer-sub.c index 1c9d676ebb8f..25fb7f043d46 100644 --- a/clang/test/Analysis/pointer-sub.c +++ b/clang/test/Analysis/pointer-sub.c @@ -1,5 +1,7 @@ // RUN: %clang_analyze_cc1 -analyzer-checker=security.PointerSub -analyzer-output=text-minimal -verify %s +typedef int * Ptr; + void f1(void) { int x, y, z[10]; int d = &y - &x; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} @@ -10,6 +12,12 @@ void f1(void) { d = &x - (&x + 1); // no-warning d = (&x + 0) - &x; // no-warning d = (z + 10) - z; // no-warning + d = (long long)&y - (long long)&x; // no-warning + long long l = 1; + d = l - (long long)&y; // no-warning + Ptr p1 = &x; + Ptr p2 = &y; + d = p1 - p2; // expected-warning{{Subtraction of two pointers that do not point into the same array is undefined behavior}} } void f2(void) { @@ -28,6 +36,10 @@ void f2(void) { d = (int *)((char *)(&a[4]) + sizeof(int)) - &a[4]; // no-warning (pointers into the same array data) d = (int *)((char *)(&a[4]) + 1) - &a[4]; // expected-warning{{Subtraction of two pointers that}} + + long long a1 = 
(long long)&a[1]; + long long b1 = (long long)&b[1]; + d = a1 - b1; } void f3(void) { -- GitLab From 6a65e98fa7901dc1de91172d065fafb16ce89d77 Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Fri, 11 Oct 2024 18:19:21 +0800 Subject: [PATCH 018/345] [InstCombine] Drop range attributes in `foldIsPowerOf2` (#111946) Fixes https://github.com/llvm/llvm-project/issues/111934. --- .../InstCombine/InstCombineAndOrXor.cpp | 18 ++++++++--- llvm/test/Transforms/InstCombine/ispow2.ll | 32 +++++++++++++++++++ 2 files changed, 45 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 688601a8ffa5..964616a4eb35 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -955,9 +955,11 @@ static Value *foldIsPowerOf2OrZero(ICmpInst *Cmp0, ICmpInst *Cmp1, bool IsAnd, } /// Reduce a pair of compares that check if a value has exactly 1 bit set. -/// Also used for logical and/or, must be poison safe. +/// Also used for logical and/or, must be poison safe if range attributes are +/// dropped. static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, - InstCombiner::BuilderTy &Builder) { + InstCombiner::BuilderTy &Builder, + InstCombinerImpl &IC) { // Handle 'and' / 'or' commutation: make the equality check the first operand. if (JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_NE) std::swap(Cmp0, Cmp1); @@ -971,7 +973,10 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_ULT, m_Intrinsic(m_Specific(X)), m_SpecificInt(2)))) { - Value *CtPop = Cmp1->getOperand(0); + auto *CtPop = cast(Cmp1->getOperand(0)); + // Drop range attributes and re-infer them in the next iteration. 
+ CtPop->dropPoisonGeneratingAnnotations(); + IC.addToWorklist(CtPop); return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1)); } // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1 @@ -980,7 +985,10 @@ static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd, match(Cmp1, m_SpecificICmp(ICmpInst::ICMP_UGT, m_Intrinsic(m_Specific(X)), m_SpecificInt(1)))) { - Value *CtPop = Cmp1->getOperand(0); + auto *CtPop = cast(Cmp1->getOperand(0)); + // Drop range attributes and re-infer them in the next iteration. + CtPop->dropPoisonGeneratingAnnotations(); + IC.addToWorklist(CtPop); return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1)); } return nullptr; @@ -3375,7 +3383,7 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, if (Value *V = foldSignedTruncationCheck(LHS, RHS, I, Builder)) return V; - if (Value *V = foldIsPowerOf2(LHS, RHS, IsAnd, Builder)) + if (Value *V = foldIsPowerOf2(LHS, RHS, IsAnd, Builder, *this)) return V; if (Value *V = foldPowerOf2AndShiftedMask(LHS, RHS, IsAnd, Builder)) diff --git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll index c21ad95f83a1..832c066370b0 100644 --- a/llvm/test/Transforms/InstCombine/ispow2.ll +++ b/llvm/test/Transforms/InstCombine/ispow2.ll @@ -1522,3 +1522,35 @@ define <2 x i1> @not_pow2_or_z_known_bits_fail_wrong_cmp(<2 x i32> %xin) { %r = icmp ugt <2 x i32> %cnt, ret <2 x i1> %r } + +; Make sure that range attributes on return values are dropped after merging these two icmps + +define i1 @has_single_bit(i32 %x) { +; CHECK-LABEL: @has_single_bit( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[POPCNT:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = icmp eq i32 [[POPCNT]], 1 +; CHECK-NEXT: ret i1 [[SEL]] +; +entry: + %cmp1 = icmp ne i32 %x, 0 + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp2 = icmp ult i32 %popcnt, 2 + %sel = select i1 %cmp1, i1 %cmp2, 
i1 false + ret i1 %sel +} + +define i1 @has_single_bit_inv(i32 %x) { +; CHECK-LABEL: @has_single_bit_inv( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[POPCNT:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) +; CHECK-NEXT: [[SEL:%.*]] = icmp ne i32 [[POPCNT]], 1 +; CHECK-NEXT: ret i1 [[SEL]] +; +entry: + %cmp1 = icmp eq i32 %x, 0 + %popcnt = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) + %cmp2 = icmp ugt i32 %popcnt, 1 + %sel = select i1 %cmp1, i1 true, i1 %cmp2 + ret i1 %sel +} -- GitLab From 65da32c634a8345fcbe021f69fc6a609d074c08c Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 11 Oct 2024 11:26:57 +0100 Subject: [PATCH 019/345] [LV] Account for any-of reduction when computing costs of blend phis. Any-of reductions are narrowed to i1. Update the legacy cost model to use the correct type when computing the cost of a phi that gets lowered to selects (BLEND). This fixes a divergence between legacy and VPlan-based cost models after 36fc291b6ec6d. Fixes https://github.com/llvm/llvm-project/issues/111874. --- .../Transforms/Vectorize/LoopVectorize.cpp | 24 ++- .../RISCV/blend-any-of-reduction-cost.ll | 167 ++++++++++++++++++ 2 files changed, 189 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 05dc58a42249..54f57fb0b6b5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6480,12 +6480,32 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, // Phi nodes in non-header blocks (not inductions, reductions, etc.) are // converted into select instructions. We require N - 1 selects per phi // node, where N is the number of incoming values. 
- if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) + if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) { + Type *ResultTy = Phi->getType(); + + // All instructions in an Any-of reduction chain are narrowed to bool. + // Check if that is the case for this phi node. + auto *HeaderUser = cast_if_present( + find_singleton(Phi->users(), [this](User *U, bool) -> User * { + auto *Phi = dyn_cast(U); + if (Phi && Phi->getParent() == TheLoop->getHeader()) + return Phi; + return nullptr; + })); + if (HeaderUser) { + auto &ReductionVars = Legal->getReductionVars(); + auto Iter = ReductionVars.find(HeaderUser); + if (Iter != ReductionVars.end() && + RecurrenceDescriptor::isAnyOfRecurrenceKind( + Iter->second.getRecurrenceKind())) + ResultTy = Type::getInt1Ty(Phi->getContext()); + } return (Phi->getNumIncomingValues() - 1) * TTI.getCmpSelInstrCost( - Instruction::Select, ToVectorTy(Phi->getType(), VF), + Instruction::Select, ToVectorTy(ResultTy, VF), ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF), CmpInst::BAD_ICMP_PREDICATE, CostKind); + } return TTI.getCFInstrCost(Instruction::PHI, CostKind); } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll new file mode 100644 index 000000000000..7db47cb9171d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/blend-any-of-reduction-cost.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p loop-vectorize -S %s | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128" +target triple = "riscv64-unknown-linux-gnu" + +; Test case for https://github.com/llvm/llvm-project/issues/111874. 
+define i32 @any_of_reduction_used_in_blend(ptr %src, i64 %N, i1 %c.0, i1 %c.1) #0 { +; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[ANY_OF_RED:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: br i1 [[C_0]], label %[[LOOP_LATCH]], label %[[ELSE_1:.*]] +; CHECK: [[ELSE_1]]: +; CHECK-NEXT: br i1 [[C_1]], label %[[LOOP_LATCH]], label %[[ELSE_2:.*]] +; CHECK: [[ELSE_2]]: +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C_2]], i32 0, i32 [[ANY_OF_RED]] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ANY_OF_RED_NEXT]] = phi i32 [ [[ANY_OF_RED]], %[[LOOP_HEADER]] ], [ [[ANY_OF_RED]], %[[ELSE_1]] ], [ [[SEL]], %[[ELSE_2]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP_HEADER]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ANY_OF_RED_NEXT]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %any.of.red = phi i32 [ 0, %entry ], [ %any.of.red.next, %loop.latch ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c.0, label %loop.latch, label %else.1 + +else.1: + br i1 %c.1, label %loop.latch, label %else.2 + +else.2: + %l = load ptr, ptr %src, align 8 + %c.2 = icmp eq ptr %l, null + %sel = select i1 %c.2, i32 0, i32 %any.of.red + br label %loop.latch + +loop.latch: + %any.of.red.next = phi i32 [ %any.of.red, %loop.header ], [ %any.of.red, %else.1 ], [ %sel, %else.2 ] + 
%iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i32 [ %any.of.red.next, %loop.latch ] + ret i32 %res +} + +define i32 @any_of_reduction_used_in_blend_with_mutliple_phis(ptr %src, i64 %N, i1 %c.0, i1 %c.1) #0 { +; CHECK-LABEL: define i32 @any_of_reduction_used_in_blend_with_mutliple_phis( +; CHECK-SAME: ptr [[SRC:%.*]], i64 [[N:%.*]], i1 [[C_0:%.*]], i1 [[C_1:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[C_0]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i1 [[C_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, ptr [[SRC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PREDPHI:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-NEXT: [[TMP6:%.*]] = xor [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = xor [[BROADCAST_SPLAT2]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP6]], [[TMP7]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2p0.nxv2p0( [[BROADCAST_SPLAT4]], i32 8, [[TMP8]], poison) +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq [[WIDE_MASKED_GATHER]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = or [[VEC_PHI]], [[TMP9]] +; CHECK-NEXT: [[PREDPHI]] = select [[TMP8]], [[TMP10]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv2i1( [[PREDPHI]]) +; CHECK-NEXT: [[TMP13:%.*]] = freeze i1 [[TMP12]] +; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 0, i32 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[ANY_OF_RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ANY_OF_RED_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: br i1 [[C_0]], label %[[X_1:.*]], label %[[ELSE_1:.*]] +; CHECK: [[ELSE_1]]: +; CHECK-NEXT: br i1 [[C_1]], label %[[X_1]], label %[[ELSE_2:.*]] +; CHECK: 
[[ELSE_2]]: +; CHECK-NEXT: [[L:%.*]] = load ptr, ptr [[SRC]], align 8 +; CHECK-NEXT: [[C_2:%.*]] = icmp eq ptr [[L]], null +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[C_2]], i32 0, i32 [[ANY_OF_RED]] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[X_1]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[ANY_OF_RED]], %[[LOOP_HEADER]] ], [ [[ANY_OF_RED]], %[[ELSE_1]] ] +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ANY_OF_RED_NEXT]] = phi i32 [ [[P]], %[[X_1]] ], [ [[SEL]], %[[ELSE_2]] ] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ANY_OF_RED_NEXT]], %[[LOOP_LATCH]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + br label %loop.header + +loop.header: + %any.of.red = phi i32 [ 0, %entry ], [ %any.of.red.next, %loop.latch ] + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + br i1 %c.0, label %x.1, label %else.1 + +else.1: + br i1 %c.1, label %x.1, label %else.2 + +else.2: + %l = load ptr, ptr %src, align 8 + %c.2 = icmp eq ptr %l, null + %sel = select i1 %c.2, i32 0, i32 %any.of.red + br label %loop.latch + +x.1: + %p = phi i32 [ %any.of.red, %loop.header ], [ %any.of.red, %else.1 ] + br label %loop.latch + +loop.latch: + %any.of.red.next = phi i32 [ %p, %x.1 ], [ %sel, %else.2 ] + %iv.next = add i64 %iv, 1 + %ec = icmp eq i64 %iv.next, %N + br i1 %ec, label %exit, label %loop.header + +exit: + %res = phi i32 [ %any.of.red.next, %loop.latch ] + ret i32 %res +} + +attributes #0 = { "target-cpu"="sifive-p670" } +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. 
-- GitLab From d941254da94c8a5897689a74012a57de279c2c9e Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 11 Oct 2024 11:00:07 +0000 Subject: [PATCH 020/345] [lldb][test] Fix var name typo in TestProcessSaveCoreMinidump --- .../TestProcessSaveCoreMinidump.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py index 4818dde4f3b8..808de687e6ea 100644 --- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py +++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py @@ -117,14 +117,14 @@ class ProcessSaveCoreMinidumpTestCase(TestBase): expected_number_of_threads = process.GetNumThreads() expected_threads = [] stacks_to_sp_map = {} - stakcs_to_registers_map = {} + stacks_to_registers_map = {} for thread_idx in range(process.GetNumThreads()): thread = process.GetThreadAtIndex(thread_idx) thread_id = thread.GetThreadID() expected_threads.append(thread_id) stacks_to_sp_map[thread_id] = thread.GetFrameAtIndex(0).GetSP() - stakcs_to_registers_map[thread_id] = thread.GetFrameAtIndex( + stacks_to_registers_map[thread_id] = thread.GetFrameAtIndex( 0 ).GetRegisters() @@ -138,7 +138,7 @@ class ProcessSaveCoreMinidumpTestCase(TestBase): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.runCmd(base_command + " --style=modified-memory '%s'" % (core_dirty)) @@ -149,7 +149,7 @@ class ProcessSaveCoreMinidumpTestCase(TestBase): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.runCmd(base_command + " --style=full '%s'" % (core_full)) @@ -160,7 +160,7 @@ class ProcessSaveCoreMinidumpTestCase(TestBase): expected_modules, expected_threads, stacks_to_sp_map, - 
stakcs_to_registers_map, + stacks_to_registers_map, ) options = lldb.SBSaveCoreOptions() @@ -178,7 +178,7 @@ class ProcessSaveCoreMinidumpTestCase(TestBase): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) options = lldb.SBSaveCoreOptions() @@ -195,7 +195,7 @@ class ProcessSaveCoreMinidumpTestCase(TestBase): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) # Minidump can now save full core files, but they will be huge and @@ -214,7 +214,7 @@ class ProcessSaveCoreMinidumpTestCase(TestBase): expected_modules, expected_threads, stacks_to_sp_map, - stakcs_to_registers_map, + stacks_to_registers_map, ) self.assertSuccess(process.Kill()) -- GitLab From 4451f9f812d458f6b53785b27869674caf01e67b Mon Sep 17 00:00:00 2001 From: Sebastian Kreutzer Date: Fri, 11 Oct 2024 07:11:03 -0400 Subject: [PATCH 021/345] [XRay] Fix LLVM include in xray_interface.cpp (#111978) Removes a dependency on LLVM in `xray_interface.cpp` by replacing `llvm_unreachable` with compiler-rt's `UNREACHABLE`. Applies clang-format to some unformatted changes. 
Original PR: #90959 --- clang/include/clang/Driver/XRayArgs.h | 4 +-- clang/lib/Driver/XRayArgs.cpp | 8 ++--- compiler-rt/include/xray/xray_interface.h | 40 +++++++++++++---------- compiler-rt/lib/xray/xray_interface.cpp | 5 ++- 4 files changed, 29 insertions(+), 28 deletions(-) diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index 8fbcf469e5ba..1b5c4a4c42f1 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -36,9 +36,7 @@ public: llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } - bool needsXRayDSORt() const { - return XRayInstrument && XRayRT && XRayShared; - } + bool needsXRayDSORt() const { return XRayInstrument && XRayRT && XRayShared; } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index 411054e067cb..d0bb5d4887c1 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,8 +63,8 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } - if (Args.hasFlag(options::OPT_fxray_shared, - options::OPT_fno_xray_shared, false)) { + if (Args.hasFlag(options::OPT_fxray_shared, options::OPT_fno_xray_shared, + false)) { XRayShared = true; // DSO instrumentation is currently limited to x86_64 @@ -75,8 +75,8 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { unsigned PICLvl = std::get<1>(tools::ParsePICArgs(TC, Args)); if (!PICLvl) { - D.Diag(diag::err_opt_not_valid_without_opt) - << "-fxray-shared" << "-fPIC"; + D.Diag(diag::err_opt_not_valid_without_opt) << "-fxray-shared" + << "-fPIC"; } } diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 717cfe292ce4..675ea0cbc48c 100644 --- 
a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,8 +93,8 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points in all currently loaded objects. See XRayPatchingStatus -/// for possible result values. +/// This tells XRay to patch the instrumentation points in all currently loaded +/// objects. See XRayPatchingStatus for possible result values. extern XRayPatchingStatus __xray_patch(); /// This tells XRay to patch the instrumentation points in the given object. @@ -105,8 +105,8 @@ extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for possible -/// result values. +/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for +/// possible result values. extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); /// This unpacks the given (packed) function id and patches @@ -114,8 +114,8 @@ extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This patches a specific function in the given object. See XRayPatchingStatus for possible -/// result values. +/// This patches a specific function in the given object. See XRayPatchingStatus +/// for possible result values. extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, int32_t ObjId); @@ -129,26 +129,29 @@ extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, int32_t ObjId); -/// This function unpacks the given (packed) function id and returns the address of the corresponding function. We return 0 if we encounter any error, even if 0 may be a valid function -/// address. 
+/// This function unpacks the given (packed) function id and returns the address +/// of the corresponding function. We return 0 if we encounter any error, even +/// if 0 may be a valid function address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the address of the function in the given object provided valid function and object -/// ids. We return 0 if we encounter any error, even if 0 may be a valid function -/// address. +/// This function returns the address of the function in the given object +/// provided valid function and object ids. We return 0 if we encounter any +/// error, even if 0 may be a valid function address. extern uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId); -/// This function returns the maximum valid function id for the main executable (object id = 0). Returns 0 if we -/// encounter errors (when there are no instrumented functions, etc.). +/// This function returns the maximum valid function id for the main executable +/// (object id = 0). Returns 0 if we encounter errors (when there are no +/// instrumented functions, etc.). extern size_t __xray_max_function_id(); -/// This function returns the maximum valid function id for the given object. Returns 0 if we -/// encounter errors (when there are no instrumented functions, etc.). +/// This function returns the maximum valid function id for the given object. +/// Returns 0 if we encounter errors (when there are no instrumented functions, +/// etc.). extern size_t __xray_max_function_id_in_object(int32_t ObjId); -/// This function returns the number of previously registered objects (executable + loaded DSOs). -/// Returns 0 if XRay has not been initialized. +/// This function returns the number of previously registered objects +/// (executable + loaded DSOs). Returns 0 if XRay has not been initialized. extern size_t __xray_num_objects(); /// Unpacks the function id from the given packed id. 
@@ -158,7 +161,8 @@ extern int32_t __xray_unpack_function_id(int32_t PackedId);
 extern int32_t __xray_unpack_object_id(int32_t PackedId);
 
 /// Creates and returns a packed id from the given function and object ids.
-/// If the ids do not fit within the reserved number of bits for each part, the high bits are truncated.
+/// If the ids do not fit within the reserved number of bits for each part, the
+/// high bits are truncated.
 extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId);
 
 /// Initialize the required XRay data structures. This is useful in cases where
diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp
index 16e60bfc22cd..402fc3d07b4e 100644
--- a/compiler-rt/lib/xray/xray_interface.cpp
+++ b/compiler-rt/lib/xray/xray_interface.cpp
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "xray_interface_internal.h"
-#include "llvm/Support/ErrorHandling.h"
 
 #include
 #include
@@ -411,9 +410,9 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
       CombinedStatus = NOT_INITIALIZED;
       break;
     case ONGOING:
-      llvm_unreachable("Status ONGOING should not appear at this point");
+      UNREACHABLE("Status ONGOING should not appear at this point");
     default:
-      llvm_unreachable("Unhandled patching status");
+      UNREACHABLE("Unhandled patching status");
     }
   }
   return CombinedStatus;
--
GitLab


From 0163ac1f53abc0a0f6e5b7e56912c1dee67e7f32 Mon Sep 17 00:00:00 2001
From: Mats Petersson
Date: Fri, 11 Oct 2024 12:23:37 +0100
Subject: [PATCH 022/345] [Flang][OpenMP]Add tests for TODOs and small changes to improve messages (#111562)

The bulk of this change is new tests to check that we get a "Not yet
implemented: *some stuff here*" message when using some not yet
supported OpenMP functionality.
For some of these cases, this also means adding additional clauses to a
filter list in OpenMP.cpp - this changes nothing [to the best of my
understanding] other than allowing the clause to get to the point where
it can be rejected in a TODO with a clearer message.

One of the TODO filters was missing the Mergeable clause, so this was
also added and the existing test updated for the new, more specific
error message.

There is no functional change intended here.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp | 9 ++++++---
 flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 | 14 ++++++++++++++
 flang/test/Lower/OpenMP/Todo/reduction-task.f90 | 12 ++++++++++++
 .../test/Lower/OpenMP/Todo/target-inreduction.f90 | 15 +++++++++++++++
 flang/test/Lower/OpenMP/Todo/task-inreduction.f90 | 15 +++++++++++++++
 flang/test/Lower/OpenMP/Todo/task_mergeable.f90 | 2 +-
 .../OpenMP/Todo/taskgroup-task-reduction.f90 | 10 ++++++++++
 flang/test/Lower/OpenMP/Todo/taskloop.f90 | 13 +++++++++++++
 flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 | 10 ++++++++++
 flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 | 8 ++++++++
 10 files changed, 104 insertions(+), 4 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/Todo/reduction-inscan.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/reduction-task.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/target-inreduction.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/task-inreduction.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/taskloop.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/taskwait-depend.f90
 create mode 100644 flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 0735e40ea2ca..a89029b720e7 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1310,8 +1310,8 @@ static void genTaskClauses(lower::AbstractConverter &converter,
cp.processUntied(clauseOps); // TODO Support delayed privatization. - cp.processTODO( - loc, llvm::omp::Directive::OMPD_task); + cp.processTODO(loc, llvm::omp::Directive::OMPD_task); } static void genTaskgroupClauses(lower::AbstractConverter &converter, @@ -2780,7 +2780,10 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, !std::holds_alternative(clause.u) && !std::holds_alternative(clause.u) && !std::holds_alternative(clause.u) && - !std::holds_alternative(clause.u)) { + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u) && + !std::holds_alternative(clause.u)) { TODO(clauseLocation, "OpenMP Block construct clause"); } } diff --git a/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 b/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 new file mode 100644 index 000000000000..c5f196fe0969 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/reduction-inscan.f90 @@ -0,0 +1,14 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Reduction modifiers are not supported +subroutine reduction_inscan() + integer :: i,j + i = 0 + + !$omp do reduction(inscan, +:i) + do j=1,10 + i = i + 1 + end do + !$omp end do +end subroutine reduction_inscan diff --git a/flang/test/Lower/OpenMP/Todo/reduction-task.f90 b/flang/test/Lower/OpenMP/Todo/reduction-task.f90 new file mode 100644 index 000000000000..6707f65e1a4c --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/reduction-task.f90 @@ -0,0 +1,12 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +! 
CHECK: not yet implemented: Reduction modifiers are not supported +subroutine reduction_task() + integer :: i + i = 0 + + !$omp parallel reduction(task, +:i) + i = i + 1 + !$omp end parallel +end subroutine reduction_task diff --git a/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 new file mode 100644 index 000000000000..e5a9cffac5a1 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/target-inreduction.f90 @@ -0,0 +1,15 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 -o - %s 2>&1 | FileCheck %s + +!=============================================================================== +! `mergeable` clause +!=============================================================================== + +! CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TARGET construct +subroutine omp_target_inreduction() + integer i + i = 0 + !$omp target in_reduction(+:i) + i = i + 1 + !$omp end target +end subroutine omp_target_inreduction diff --git a/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 b/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 new file mode 100644 index 000000000000..aeed680a6dba --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/task-inreduction.f90 @@ -0,0 +1,15 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s 2>&1 | FileCheck %s + +!=============================================================================== +! `mergeable` clause +!=============================================================================== + +! 
CHECK: not yet implemented: Unhandled clause IN_REDUCTION in TASK construct +subroutine omp_task_in_reduction() + integer i + i = 0 + !$omp task in_reduction(+:i) + i = i + 1 + !$omp end task +end subroutine omp_task_in_reduction diff --git a/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 b/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 index 13145d92ccf9..ddc27487abfe 100644 --- a/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 +++ b/flang/test/Lower/OpenMP/Todo/task_mergeable.f90 @@ -5,7 +5,7 @@ ! `mergeable` clause !=============================================================================== -! CHECK: not yet implemented: OpenMP Block construct clause +! CHECK: not yet implemented: Unhandled clause MERGEABLE in TASK construct subroutine omp_task_mergeable() !$omp task mergeable call foo() diff --git a/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 b/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 new file mode 100644 index 000000000000..1cb471d784d7 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskgroup-task-reduction.f90 @@ -0,0 +1,10 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause TASK_REDUCTION in TASKGROUP construct +subroutine omp_taskgroup_task_reduction + integer :: res + !$omp taskgroup task_reduction(+:res) + res = res + 1 + !$omp end taskgroup +end subroutine omp_taskgroup_task_reduction diff --git a/flang/test/Lower/OpenMP/Todo/taskloop.f90 b/flang/test/Lower/OpenMP/Todo/taskloop.f90 new file mode 100644 index 000000000000..aca050584cbb --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskloop.f90 @@ -0,0 +1,13 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! 
CHECK: not yet implemented: Taskloop construct +subroutine omp_taskloop + integer :: res, i + !$omp taskloop + do i = 1, 10 + res = res + 1 + end do + !$omp end taskloop +end subroutine omp_taskloop + diff --git a/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 b/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 new file mode 100644 index 000000000000..d1f953be8802 --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskwait-depend.f90 @@ -0,0 +1,10 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=50 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause DEPEND in TASKWAIT construct +subroutine omp_tw_depend + integer :: res + !$omp taskwait depend(out: res) + res = res + 1 +end subroutine omp_tw_depend + diff --git a/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 b/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 new file mode 100644 index 000000000000..21e8609b08ba --- /dev/null +++ b/flang/test/Lower/OpenMP/Todo/taskwait-nowait.f90 @@ -0,0 +1,8 @@ +! RUN: %not_todo_cmd bbc -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s +! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -o - %s -fopenmp-version=51 2>&1 | FileCheck %s + +! CHECK: not yet implemented: Unhandled clause NOWAIT in TASKWAIT construct +subroutine omp_tw_nowait + !$omp taskwait nowait +end subroutine omp_tw_nowait + -- GitLab From b5ea5be2a714e28bac57d417c221f687efe396bf Mon Sep 17 00:00:00 2001 From: Sam Elliott Date: Fri, 11 Oct 2024 13:24:54 +0200 Subject: [PATCH 023/345] [RISCV][MC] Fix >32bit .insn Directives (#111878) The original patch had a reasonably significant bug. You could not use `.insn` to assemble encodings that had any bits set above the low 32 bits. This is due to the fact that `getMachineOpValue` was truncating the immediate value, and I did not commit enough tests of useful cases. 
This changes the result of `getMachineOpValue` to be able to return the 48-bit and 64-bit immediates needed for the wider `.insn` directives. I took the opportunity to move some of the test cases around in the file to make looking at the output of `llvm-objdump` a little clearer. --- .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 6 ++-- llvm/test/MC/RISCV/insn.s | 35 +++++++++++++++---- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp index 66970ed37f27..54f1a3899c49 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp @@ -77,7 +77,7 @@ public: /// Return binary encoding of operand. If the machine operand requires /// relocation, record the relocation and return zero. - unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO, + uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -375,7 +375,7 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, ++MCNumEmitted; // Keep track of the # of mi's emitted. 
} -unsigned +uint64_t RISCVMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { @@ -384,7 +384,7 @@ RISCVMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()); if (MO.isImm()) - return static_cast(MO.getImm()); + return MO.getImm(); llvm_unreachable("Unhandled expression!"); return 0; diff --git a/llvm/test/MC/RISCV/insn.s b/llvm/test/MC/RISCV/insn.s index e32fec25bb16..d24f4fe8b363 100644 --- a/llvm/test/MC/RISCV/insn.s +++ b/llvm/test/MC/RISCV/insn.s @@ -170,17 +170,40 @@ target: # CHECK-OBJ: .insn 6, 0x1f -# CHECK-ASM: .insn 0x4, 65503 -# CHECK-ASM: encoding: [0xdf,0xff,0x00,0x00] -# CHECK-OBJ: -.insn 0xffdf - # CHECK-ASM: .insn 0x8, 63 # CHECK-ASM: encoding: [0x3f,0x00,0x00,0x00,0x00,0x00,0x00,0x00] # CHECK-OBJ: .insn 8, 0x3f +# CHECK-ASM: .insn 0x6, 281474976710623 +# CHECK-ASM: encoding: [0xdf,0xff,0xff,0xff,0xff,0xff] +# CHECK-OBJ: +.insn 0x6, 0xffffffffffdf + +# CHECK-ASM: .insn 0x8, -65 +# CHECK-ASM: encoding: [0xbf,0xff,0xff,0xff,0xff,0xff,0xff,0xff] +# CHECK-OBJ: +.insn 0x8, 0xffffffffffffffbf + +odd_lengths: +# CHECK-ASM-LABEL: odd_lengths: +# CHECK-OBJ-LABEL: : + +## These deliberately disagree with the lengths objdump expects them to have, so +## keep them at the end so that the disassembled instruction stream is not out +## of sync with the encoded instruction stream. We don't check for `` +## as we could get any number of those, so instead check for the encoding +## halfwords. These might be split into odd 16-bit chunks, so each chunk is on +## one line. 
+ +# CHECK-ASM: .insn 0x4, 65503 +# CHECK-ASM: encoding: [0xdf,0xff,0x00,0x00] +# CHECK-OBJ: ffdf +# CHECK-OBJ: 0000 +.insn 0xffdf + # CHECK-ASM: .insn 0x4, 65471 # CHECK-ASM: encoding: [0xbf,0xff,0x00,0x00] -# CHECK-OBJ: +# CHECK-OBJ: ffbf +# CHECK-OBJ: 0000 .insn 0xffbf -- GitLab From 303c8d20601d810c177f6646f771c1eb3f29ab8c Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Fri, 11 Oct 2024 12:29:44 +0100 Subject: [PATCH 024/345] [AArch64] Add SchedReadAdvance to Neoverse-V1 scheduling model. (#111538) Introduce a description of late forwarding to the Neoverse-V1 Scheduling model. --- .../Target/AArch64/AArch64SchedNeoverseV1.td | 207 ++- .../llvm-mca/AArch64/Neoverse/V1-forwarding.s | 1421 +++++++++++++++++ .../AArch64/Neoverse/V1-neon-instructions.s | 138 +- 3 files changed, 1645 insertions(+), 121 deletions(-) create mode 100644 llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s diff --git a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td index f7e6545f0dd3..fb4d2f3d7bcd 100644 --- a/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td +++ b/llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td @@ -469,6 +469,89 @@ def V1Write_11c_9L01_9S_9V : SchedWriteRes<[V1UnitL01, V1UnitL01, V1UnitL01, V1UnitV, V1UnitV, V1UnitV, V1UnitV, V1UnitV, V1UnitV]>; +//===----------------------------------------------------------------------===// +// Define forwarded types + +// NOTE: SOG, p. 20, n. 2: Accumulator forwarding is not supported for +// consumers of 64 bit multiply high operations? 
+def V1Wr_IM : SchedWriteRes<[V1UnitM]> { let Latency = 2; } +def V1Wr_IMA : SchedWriteRes<[V1UnitM0]> { let Latency = 2; } +def V1WriteIM : SchedWriteVariant< + [SchedVar, + SchedVar]>; +def V1Rd_IMA : SchedReadAdvance<1, [V1Wr_IMA]>; + +def V1Wr_FMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FMA : SchedReadAdvance<2, [WriteFMul, V1Wr_FMA]>; + +def V1Wr_ADA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; } +def V1Rd_ADA : SchedReadAdvance<3, [V1Wr_ADA]>; + +def V1Wr_VDOT : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_VDOT : SchedReadAdvance<2, [V1Wr_VDOT]>; + +def V1Wr_VMMA : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_VMMA : SchedReadAdvance<2, [V1Wr_VMMA]>; + +def V1Wr_VMA : SchedWriteRes<[V1UnitV02]> { let Latency = 4; } +def V1Rd_VMA : SchedReadAdvance<3, [V1Wr_VMA]>; + +def V1Wr_VMAL : SchedWriteRes<[V1UnitV02]> { let Latency = 4; } +def V1Rd_VMAL : SchedReadAdvance<3, [V1Wr_VMAL]>; + +def V1Wr_VSA : SchedWriteRes<[V1UnitV13]> { let Latency = 4; } +def V1Rd_VSA : SchedReadAdvance<3, [V1Wr_VSA]>; + +def V1Wr_FCMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FCMA : SchedReadAdvance<2, [V1Wr_FCMA]>; + +def V1Wr_FPM : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Wr_FPMA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_FPMA : SchedReadAdvance<2, [V1Wr_FPM, V1Wr_FPMA]>; + +def V1Wr_FPMAL : SchedWriteRes<[V1UnitV]> { let Latency = 5; } +def V1Rd_FPMAL : SchedReadAdvance<3, [V1Wr_FPMAL]>; + +def V1Wr_BFD : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_BFD : SchedReadAdvance<2, [V1Wr_BFD]>; + +def V1Wr_BFMMA : SchedWriteRes<[V1UnitV]> { let Latency = 5; } +def V1Rd_BFMMA : SchedReadAdvance<2, [V1Wr_BFMMA]>; + +def V1Wr_BFMLA : SchedWriteRes<[V1UnitV]> { let Latency = 4; } +def V1Rd_BFMLA : SchedReadAdvance<2, [V1Wr_BFMLA]>; + +def V1Wr_CRC : SchedWriteRes<[V1UnitM0]> { let Latency = 2; } +def V1Rd_CRC : SchedReadAdvance<1, [V1Wr_CRC]>; + +def V1Wr_ZDOTB : SchedWriteRes<[V1UnitV01]> { let 
Latency = 3; } +def V1Rd_ZDOTB : SchedReadAdvance<2, [V1Wr_ZDOTB]>; + +def V1Wr_ZUDOTB : SchedWriteRes<[V1UnitV]> { let Latency = 3; } +def V1Rd_ZUDOTB : SchedReadAdvance<2, [V1Wr_ZUDOTB]>; + +def V1Wr_ZDOTH : SchedWriteRes<[V1UnitV0]> { let Latency = 4; } +def V1Rd_ZDOTH : SchedReadAdvance<3, [V1Wr_ZDOTH]>; + +def V1Wr_ZMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 3; } +def V1Rd_ZMMA : SchedReadAdvance<2, [V1Wr_ZMMA]>; + +let Latency = 5, NumMicroOps = 2 in +def V1Wr_ZMAD : SchedWriteRes<[V1UnitV0, V1UnitV0]>; +def V1Rd_ZMAD : SchedReadAdvance<3, [V1Wr_ZMAD]>; + +def V1Wr_ZFCMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZFCMA : SchedReadAdvance<3, [V1Wr_ZFCMA]>; + +def V1Wr_ZFMA : SchedWriteRes<[V1UnitV01]> { let Latency = 4; } +def V1Rd_ZFMA : SchedReadAdvance<2, [V1Wr_ZFMA]>; + +def V1Wr_ZBFDOT : SchedWriteRes<[V1UnitV01]> { let Latency = 4; } +def V1Rd_ZBFDOT : SchedReadAdvance<2, [V1Wr_ZBFDOT]>; +def V1Wr_ZBFMMA : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZBFMMA : SchedReadAdvance<2, [V1Wr_ZBFMMA]>; +def V1Wr_ZBFMAL : SchedWriteRes<[V1UnitV01]> { let Latency = 5; } +def V1Rd_ZBFMAL : SchedReadAdvance<3, [V1Wr_ZBFMAL]>; // Miscellaneous Instructions // ----------------------------------------------------------------------------- @@ -553,16 +636,19 @@ def : InstRW<[V1Write_1c_1J], (instrs SETF8, SETF16, RMIF, CFINV)>; def : SchedAlias; def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + // Multiply -// Multiply accumulate -// Multiply accumulate, long -// Multiply long -def V1WriteIM : SchedWriteVariant< - [SchedVar, - SchedVar]>; -def : SchedAlias; -def : SchedAlias; +// Multiply accumulate, W-form +// Multiply accumulate, X-form +def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], + (instregex "^M(ADD|SUB)[WX]rrr$")>; +// Multiply accumulate long +// Multiply long +def : InstRW<[V1WriteIM, ReadIM, ReadIM, V1Rd_IMA], + (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; // Multiply high def : InstRW<[V1Write_3c_1M, ReadIM, ReadIM], 
(instrs SMULHrr, UMULHrr)>; @@ -680,10 +766,11 @@ def : InstRW<[V1Write_15c7_1V02], (instrs FDIVDrr)>; def : InstRW<[V1Write_16c7_1V02], (instrs FSQRTDr)>; // FP multiply -def : SchedAlias; +def : WriteRes { let Latency = 3; } // FP multiply accumulate -def : InstRW<[V1Write_4c_1V], (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; +def : InstRW<[V1Wr_FMA, ReadDefault, ReadDefault, V1Rd_FMA], + (instregex "^FN?M(ADD|SUB)[HSD]rrr$")>; // FP round to integral def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ][HSD]r$", @@ -824,7 +911,7 @@ def : SchedAlias; // ASIMD absolute diff accum // ASIMD absolute diff accum long // ASIMD pairwise add and accumulate long -def : InstRW<[V1Write_4c_1V13], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>; +def : InstRW<[V1Wr_ADA, V1Rd_ADA], (instregex "^[SU]ABAL?v", "^[SU]ADALPv")>; // ASIMD arith, reduce, 4H/4S // ASIMD max/min, reduce, 4H/4S @@ -843,23 +930,26 @@ def : InstRW<[V1Write_4c_2V13], (instregex "^(ADD|[SU]ADDL)Vv16i8v$", // ASIMD dot product // ASIMD dot product using signed and unsigned integers -def : InstRW<[V1Write_2c_1V], (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>; +def : InstRW<[V1Wr_VDOT, V1Rd_VDOT], + (instregex "^([SU]|SU|US)DOT(lane)?v(8|16)i8$")>; -// ASIMD matrix multiply- accumulate -def : InstRW<[V1Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>; +// ASIMD matrix multiply-accumulate +def : InstRW<[V1Wr_VMMA, V1Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>; // ASIMD multiply +def : InstRW<[V1Write_4c_1V02], (instregex "^MULv", "^SQ(R)?DMULHv")>; + // ASIMD multiply accumulate +def : InstRW<[V1Wr_VMA, V1Rd_VMA], (instregex "^MLAv", "^MLSv")>; + // ASIMD multiply accumulate long +def : InstRW<[V1Wr_VMAL, V1Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>; + // ASIMD multiply accumulate high +def : InstRW<[V1Write_4c_1V02], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>; + // ASIMD multiply accumulate saturating long -def : InstRW<[V1Write_4c_1V02], - (instregex "^MUL(v[148]i16|v[124]i32)$", - 
"^SQR?DMULH(v[48]i16|v[24]i32)$", - "^ML[AS](v[148]i16|v[124]i32)$", - "^[SU]ML[AS]Lv", - "^SQRDML[AS]H(v[148]i16|v[124]i32)$", - "^SQDML[AS]Lv")>; +def : InstRW<[V1Write_4c_1V02], (instregex "^SQDML[AS]L[iv]")>; // ASIMD multiply/multiply long (8x8) polynomial def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>; @@ -868,11 +958,12 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^PMULL?v(8|16)i8$")>; def : InstRW<[V1Write_3c_1V02], (instregex "^([SU]|SQD)MULLv")>; // ASIMD shift accumulate +def : InstRW<[V1Wr_VSA, V1Rd_VSA], (instregex "^[SU]SRAv", "^[SU]RSRAv")>; + // ASIMD shift by immed, complex // ASIMD shift by register, complex def : InstRW<[V1Write_4c_1V13], - (instregex "^[SU]R?SRAv", - "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$", + (instregex "^RSHRNv", "^SQRSHRU?Nv", "^(SQSHLU?|UQSHL)[bhsd]$", "^(SQSHLU?|UQSHL)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32|v2i64)_shift$", "^SQSHU?RNv", "^[SU]RSHRv", "^UQR?SHRNv", "^[SU]Q?RSHLv", "^[SU]QSHLv")>; @@ -890,16 +981,25 @@ def : InstRW<[V1Write_2c_1V13], (instregex "^SHLL?v", "^SHRNv", "^[SU]SHLLv", // ASIMD FP absolute value/difference // ASIMD FP arith, normal // ASIMD FP compare -// ASIMD FP complex add // ASIMD FP max/min, normal // ASIMD FP max/min, pairwise // ASIMD FP negate // Covered by "SchedAlias (WriteV[dq]...)" above +// ASIMD FP complex add +def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$")>; + // ASIMD FP complex multiply add +def : InstRW<[V1Wr_FCMA, V1Rd_FCMA], (instregex "^FCMLAv")>; + +// ASIMD FP multiply +def : InstRW<[V1Wr_FPM], (instregex "^FMULX?v")>; + // ASIMD FP multiply accumulate -def : InstRW<[V1Write_4c_1V], (instregex "^FCADD(v[48]f16|v[24]f32|v2f64)$", - "^FML[AS]v")>; +def : InstRW<[V1Wr_FPMA, V1Rd_FPMA], (instregex "^FML[AS]v")>; + +// ASIMD FP multiply accumulate long +def : InstRW<[V1Wr_FPMAL, V1Rd_FPMAL], (instregex "^FML[AS]L2?v")>; // ASIMD FP convert, long (F16 to F32) def : InstRW<[V1Write_4c_2V02], (instregex "^FCVTLv[48]i16$")>; @@ 
-953,12 +1053,6 @@ def : InstRW<[V1Write_4c_2V], (instregex "^F(MAX|MIN)(NM)?Vv4(i16|i32)v$")>; // ASIMD FP max/min, reduce, Q-form F16 def : InstRW<[V1Write_6c_3V], (instregex "^F(MAX|MIN)(NM)?Vv8i16v$")>; -// ASIMD FP multiply -def : InstRW<[V1Write_3c_1V], (instregex "^FMULX?v")>; - -// ASIMD FP multiply accumulate long -def : InstRW<[V1Write_5c_1V], (instregex "^FML[AS]L2?v")>; - // ASIMD FP round, D-form F32 and Q-form F64 def : InstRW<[V1Write_3c_1V02], (instregex "^FRINT[AIMNPXZ]v2f(32|64)$")>; @@ -976,13 +1070,13 @@ def : InstRW<[V1Write_6c_4V02], (instregex "^FRINT[AIMNPXZ]v8f16$")>; def : InstRW<[V1Write_4c_1V02], (instrs BFCVTN, BFCVTN2)>; // ASIMD dot product -def : InstRW<[V1Write_4c_1V], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>; +def : InstRW<[V1Wr_BFD, V1Rd_BFD], (instregex "^BF(DOT|16DOTlane)v[48]bf16$")>; // ASIMD matrix multiply accumulate -def : InstRW<[V1Write_5c_1V], (instrs BFMMLA)>; +def : InstRW<[V1Wr_BFMMA, V1Rd_BFMMA], (instrs BFMMLA)>; // ASIMD multiply accumulate long -def : InstRW<[V1Write_4c_1V], (instregex "^BFMLAL[BT](Idx)?$")>; +def : InstRW<[V1Wr_BFMLA, V1Rd_BFMLA], (instregex "^BFMLAL[BT](Idx)?$")>; // Scalar convert, F32 to BF16 def : InstRW<[V1Write_3c_1V02], (instrs BFCVT)>; @@ -1300,7 +1394,7 @@ def : InstRW<[V1Write_2c_1V0], (instrs BCAX, EOR3, RAX1, XAR)>; // ----------------------------------------------------------------------------- // CRC checksum ops -def : InstRW<[V1Write_2c_1M0], (instregex "^CRC32C?[BHWX]rr$")>; +def : InstRW<[V1Wr_CRC, V1Rd_CRC], (instregex "^CRC32C?[BHWX]rr$")>; // SVE Predicate instructions @@ -1440,13 +1534,14 @@ def : InstRW<[V1Write_20c7_1V0], (instregex "^[SU]DIVR?_ZPmZ_D", "^[SU]DIV_ZPZZ_D")>; // Dot product, 8 bit -def : InstRW<[V1Write_3c_1V01], (instregex "^[SU]DOT_ZZZI?_S$")>; +def : InstRW<[V1Wr_ZDOTB, V1Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_S$")>; // Dot product, 8 bit, using signed and unsigned integers -def : InstRW<[V1Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; +def 
: InstRW<[V1Wr_ZUDOTB, V1Rd_ZUDOTB], + (instrs SUDOT_ZZZI, USDOT_ZZZ, USDOT_ZZZI)>; // Dot product, 16 bit -def : InstRW<[V1Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_D$")>; +def : InstRW<[V1Wr_ZDOTH, V1Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_D$")>; // Duplicate, immediate and indexed form def : InstRW<[V1Write_2c_1V01], (instregex "^DUP_ZI_[BHSD]$", @@ -1488,7 +1583,7 @@ def : InstRW<[V1Write_2c_1V01], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$", "^MOVPRFX_ZZ$")>; // Matrix multiply-accumulate -def : InstRW<[V1Write_3c_1V01], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZMMA, V1Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>; // Multiply, B, H, S element size def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", @@ -1497,12 +1592,16 @@ def : InstRW<[V1Write_4c_1V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_[BHS]", "^[SU]MULH_ZPZZ_[BHS]")>; // Multiply, D element size -// Multiply accumulate, D element size def : InstRW<[V1Write_5c_2V0], (instregex "^MUL_(ZI|ZPmZ|ZZZI|ZZZ)_D", "^MUL_ZPZZ_D", "^[SU]MULH_(ZPmZ|ZZZ)_D", - "^[SU]MULH_ZPZZ_D", - "^(MLA|MLS|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>; + "^[SU]MULH_ZPZZ_D")>; + +// Multiply accumulate, D element size +def : InstRW<[V1Wr_ZMAD, V1Rd_ZMAD], + (instregex "^ML[AS]_ZPZZZ_D")>; +def : InstRW<[V1Wr_ZMAD, ReadDefault, V1Rd_ZMAD], + (instregex "^(ML[AS]|MAD|MSB)_ZPmZZ_D")>; // Multiply accumulate, B, H, S element size // NOTE: This is not specified in the SOG. 
@@ -1583,8 +1682,8 @@ def : InstRW<[V1Write_2c_1V0], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$", def : InstRW<[V1Write_3c_1V01], (instregex "^FCADD_ZPmZ_[HSD]$")>; // Floating point complex multiply add -def : InstRW<[V1Write_5c_1V01], (instregex "^FCMLA_ZPmZZ_[HSD]$", - "^FCMLA_ZZZI_[HS]$")>; +def : InstRW<[V1Wr_ZFCMA, ReadDefault, V1Rd_ZFCMA], (instregex "^FCMLA_ZPmZZ_[HSD]")>; +def : InstRW<[V1Wr_ZFCMA, V1Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>; // Floating point convert, long or narrow (F16 to F32 or F32 to F16) // Floating point convert to integer, F32 @@ -1623,11 +1722,15 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]", "^FMUL_ZPZ[IZ]_[HSD]")>; // Floating point multiply accumulate +def : InstRW<[V1Wr_ZFMA, ReadDefault, V1Rd_ZFMA], + (instregex "^FN?ML[AS]_ZPmZZ_[HSD]", + "^FN?(MAD|MSB)_ZPmZZ_[HSD]")>; +def : InstRW<[V1Wr_ZFMA, V1Rd_ZFMA], + (instregex "^FML[AS]_ZZZI_[HSD]", + "^FN?ML[AS]_ZPZZZ_[HSD]")>; + // Floating point reciprocal step -def : InstRW<[V1Write_4c_1V01], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$", - "^FN?ML[AS]_ZPZZZ_[HSD]", - "^FML[AS]_ZZZI_[HSD]$", - "^F(RECPS|RSQRTS)_ZZZ_[HSD]$")>; +def : InstRW<[V1Write_4c_1V01], (instregex "^F(RECPS|RSQRTS)_ZZZ_[HSD]")>; // Floating point reciprocal estimate, F16 def : InstRW<[V1Write_6c_4V0], (instrs FRECPE_ZZ_H, FRSQRTE_ZZ_H)>; @@ -1681,13 +1784,13 @@ def : InstRW<[V1Write_3c_1V01], (instregex "^FEXPA_ZZ_[HSD]$", def : InstRW<[V1Write_4c_1V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>; // Dot product -def : InstRW<[V1Write_4c_1V01], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; +def : InstRW<[V1Wr_ZBFDOT, V1Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>; // Matrix multiply accumulate -def : InstRW<[V1Write_5c_1V01], (instrs BFMMLA_ZZZ)>; +def : InstRW<[V1Wr_ZBFMMA, V1Rd_ZBFMMA], (instrs BFMMLA_ZZZ)>; // Multiply accumulate long -def : InstRW<[V1Write_5c_1V01], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; +def : InstRW<[V1Wr_ZBFMAL, V1Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>; // SVE Load 
instructions diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s new file mode 100644 index 000000000000..4de37f960005 --- /dev/null +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-forwarding.s @@ -0,0 +1,1421 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=aarch64 -mcpu=neoverse-v1 -mattr=+sve --instruction-info=0 --resource-pressure=0 --timeline --timeline-max-iterations=2 < %s | FileCheck %s + +# LLVM-MCA-BEGIN madd +mul x0, x0, x0 +madd x0, x1, x2, x0 +madd x0, x1, x2, x0 +madd x0, x0, x0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smaddl +mul x0, x0, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w1, w2, x0 +smaddl x0, w0, w0, x0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmadd +fadd d0, d0, d0 +fmadd d0, d1, d2, d0 +fmul d0, d0, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d1, d2, d0 +fmadd d0, d0, d1, d2 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN saba +mul v0.4s, v0.4s, v0.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v1.4s, v2.4s +saba v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sadalp +mul v0.4s, v0.4s, v0.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v1.4s +sadalp v0.2d, v0.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN sdot +mul v0.4s, v0.4s, v0.4s +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v1.16b, v2.16b +sdot v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smmla +mul v0.4s, v0.4s, v0.4s +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v1.16b, v2.16b +smmla v0.4s, v0.16b, v1.16b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN mla +mul v0.4s, v0.4s, v0.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v1.4s, v2.4s +mla v0.4s, v0.4s, v1.4s +# LLVM-MCA-END + +# LLVM-MCA-BEGIN smlal2 +mul v0.4s, v0.4s, v0.4s +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v1.8h, v2.8h +smlal2 v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN ssra +mul v0.4s, v0.4s, v0.4s +ssra v0.2d, v1.2d, #1 +ssra v0.2d, v1.2d, #1 +ssra v0.2d, v0.2d, #1 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fcmla +fmul v0.4s, 
v0.4s, v0.4s +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v1.2d, v2.2d, #90 +fcmla v0.2d, v0.2d, v1.2d, #90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmla +fmul v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fadd v0.2d, v0.2d, v0.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v1.2d, v2.2d +fmla v0.2d, v0.2d, v1.2d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN fmlal +fmul v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fadd v0.2d, v0.2d, v0.2d +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v1.4h, v2.4h +fmlal v0.4s, v0.4h, v1.4h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfdot +fmul v0.2d, v0.2d, v0.2d +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v1.8h, v2.8h +bfdot v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmmla +fmul v0.2d, v0.2d, v0.2d +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v1.8h, v2.8h +bfmmla v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul v0.2d, v0.2d, v0.2d +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v1.8h, v2.8h +bfmlalb v0.4s, v0.8h, v1.8h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN crc32cb +mul w0, w0, w0 +crc32cb w0, w0, w1 +crc32cb w0, w0, w1 +crc32cb w0, w0, w0 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.s +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b +sdot z0.s, z1.b, z2.b +sdot z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sudot +mul z0.d, p0/m, z0.d, z0.d +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z1.b, z2.b[1] +sdot z0.s, z0.b, z1.b[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z sdot.d +mul z0.d, p0/m, z0.d, z0.d +sdot z0.d, z1.h, z2.h +sdot z0.d, z1.h, z2.h +sdot z0.d, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z smmla +mul z0.d, p0/m, z0.d, z0.d +smmla z0.s, z1.b, z2.b +smmla z0.s, z1.b, z2.b +smmla z0.s, z0.b, z1.b +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mla.d +mul z0.d, p0/m, z0.d, z0.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z1.d, z2.d +mla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z mad.d +mul z0.d, p0/m, z0.d, z0.d +mad z0.d, p0/m, z1.d, z2.d +mad z0.d, p0/m, z1.d, z2.d +mad z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# 
LLVM-MCA-BEGIN Z msb.d +mul z0.d, p0/m, z0.d, z0.d +msb z0.d, p0/m, z1.d, z2.d +msb z0.d, p0/m, z1.d, z2.d +msb z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZPmZZ +fmul z0.d, z0.d, z0.d +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z1.d, z2.d, 90 +fcmla z0.d, p0/m, z0.d, z1.d, 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fcmla ZZZI +fmul z0.d, z0.d, z0.d +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z1.s, z2.s[1], 90 +fcmla z0.s, z0.s, z1.s[1], 90 +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZPmZZ +fmul z0.d, z0.d, z0.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z1.d, z2.d +fmla z0.d, p0/m, z0.d, z1.d +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z fmla ZZZI +fmul z0.d, z0.d, z0.d +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z1.d, z2.d[1] +fmla z0.d, z0.d, z1.d[1] +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfdot +fmul z0.d, z0.d, z0.d +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z1.h, z2.h +bfdot z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN Z bfmmla +fmul z0.d, z0.d, z0.d +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z1.h, z2.h +bfmmla z0.s, z0.h, z1.h +# LLVM-MCA-END + +# LLVM-MCA-BEGIN bfmlalb +fmul z0.d, z0.d, z0.d +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z1.h, z2.h +bfmlalb z0.s, z0.h, z1.h +# LLVM-MCA-END + +# CHECK: [0] Code Region - madd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. madd x0, x1, x2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. madd x0, x0, x0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] D=========eeER .. madd x0, x1, x2, x0 +# CHECK-NEXT: [1,2] D==========eeER.. 
madd x0, x1, x2, x0 +# CHECK-NEXT: [1,3] D============eeER madd x0, x0, x0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 madd x0, x1, x2, x0 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 madd x0, x0, x0, x0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [1] Code Region - smaddl + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul x0, x0, x0 +# CHECK-NEXT: [0,1] D==eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,2] D===eeER . .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [0,3] D=====eeER. .. smaddl x0, w0, w0, x0 +# CHECK-NEXT: [1,0] D=======eeER .. mul x0, x0, x0 +# CHECK-NEXT: [1,1] D=========eeER .. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,2] D==========eeER.. smaddl x0, w1, w2, x0 +# CHECK-NEXT: [1,3] D============eeER smaddl x0, w0, w0, x0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul x0, x0, x0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 smaddl x0, w1, w2, x0 +# CHECK-NEXT: 3. 
2 9.5 0.0 0.0 smaddl x0, w0, w0, x0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [2] Code Region - fmadd + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeER. . . . . . .. fadd d0, d0, d0 +# CHECK-NEXT: [0,1] D==eeeeER . . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,2] D======eeeER . . . . .. fmul d0, d0, d0 +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmadd d0, d0, d1, d2 +# CHECK-NEXT: [1,0] D=================eeER . . .. fadd d0, d0, d0 +# CHECK-NEXT: [1,1] D===================eeeeER . .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,2] D=======================eeeER . .. fmul d0, d0, d0 +# CHECK-NEXT: [1,3] D========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,4] D==========================eeeeER .. fmadd d0, d1, d2, d0 +# CHECK-NEXT: [1,5] D==============================eeeeER fmadd d0, d0, d1, d2 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fadd d0, d0, d0 +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 2. 2 15.5 0.0 0.0 fmul d0, d0, d0 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 4. 2 18.5 0.0 0.0 fmadd d0, d1, d2, d0 +# CHECK-NEXT: 5. 
2 22.5 0.0 0.0 fmadd d0, d0, d1, d2 +# CHECK-NEXT: 2 15.7 0.1 0.0 + +# CHECK: [3] Code Region - saba + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] D======================eeeeER saba v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 saba v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 saba v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [4] Code Region - sadalp + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . sadalp v0.2d, v0.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . sadalp v0.2d, v1.4s +# CHECK-NEXT: [1,3] D======================eeeeER sadalp v0.2d, v0.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 sadalp v0.2d, v1.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 sadalp v0.2d, v0.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [5] Code Region - sdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 0.8 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . 
mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D===============eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] D================eeeER . sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] D===================eeeER sdot v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.5 0.0 0.0 sdot v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.5 0.0 0.0 sdot v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.8 0.1 0.0 + +# CHECK: [6] Code Region - smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1103 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.36 +# CHECK-NEXT: Block RThroughput: 0.8 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 01234 + +# CHECK: [0,0] DeeeeER . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeER. . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,2] D=====eeeER . . . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [0,3] D========eeeER . . . smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: [1,0] D===========eeeeER . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D===============eeeER . smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,2] D================eeeER . 
smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: [1,3] D===================eeeER smmla v0.4s, v0.16b, v1.16b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 6.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 2. 2 11.5 0.0 0.0 smmla v0.4s, v1.16b, v2.16b +# CHECK-NEXT: 3. 2 14.5 0.0 0.0 smmla v0.4s, v0.16b, v1.16b +# CHECK-NEXT: 2 10.8 0.1 0.0 + +# CHECK: [7] Code Region - mla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,2] D=====eeeeER . . . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [0,3] D=========eeeeER . . . mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,2] D==================eeeeER. . 
mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: [1,3] D======================eeeeER mla v0.4s, v0.4s, v1.4s + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 mla v0.4s, v1.4s, v2.4s +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 mla v0.4s, v0.4s, v1.4s +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [8] Code Region - smlal2 + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . 
smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER smlal2 v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 smlal2 v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 smlal2 v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [9] Code Region - ssra + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeeER . . . . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D====eeeeER . . . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [0,3] D=========eeeeER . . . ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: [1,0] D=============eeeeER. . . mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D=================eeeeER . . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,2] D==================eeeeER. . ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: [1,3] D======================eeeeER ssra v0.2d, v0.2d, #1 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 7.5 0.5 0.0 mul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 ssra v0.2d, v1.2d, #1 +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 ssra v0.2d, v0.2d, #1 +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [10] Code Region - fcmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [0,3] D=========eeeeER . . . fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: [1,1] D================eeeeER . . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,2] D==================eeeeER. . fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: [1,3] D======================eeeeER fcmla v0.2d, v0.2d, v1.2d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.4s, v0.4s, v0.4s +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fcmla v0.2d, v1.2d, v2.2d, #90 +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fcmla v0.2d, v0.2d, v1.2d, #90 +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [11] Code Region - fmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.35 +# CHECK-NEXT: IPC: 0.35 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D=eeeeER . . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,2] D=====eeER. . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D=======eeeeER . . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,4] D=========eeeeER . . . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [0,5] D=============eeeeER. . . .. fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: [1,0] D=================eeeER . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D==================eeeeER. . .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,2] D======================eeER . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] D========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,4] D==========================eeeeER .. fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: [1,5] D==============================eeeeER fmla v0.2d, v0.2d, v1.2d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 2. 2 14.5 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 16.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 4. 
2 18.5 0.0 0.0 fmla v0.2d, v1.2d, v2.2d +# CHECK-NEXT: 5. 2 22.5 0.0 0.0 fmla v0.2d, v0.2d, v1.2d +# CHECK-NEXT: 2 15.3 0.1 0.0 + +# CHECK: [12] Code Region - fmlal + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 600 +# CHECK-NEXT: Total Cycles: 2203 +# CHECK-NEXT: Total uOps: 600 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 1.5 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456789 +# CHECK-NEXT: Index 0123456789 0123456789 0123456 + +# CHECK: [0,0] DeeeER . . . . . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,2] D========eeER . . . . . . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,4] D============eeeeeER. . . . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [0,5] D=================eeeeeER. . . . .. fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: [1,0] D======================eeeER . . . .. fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D=========================eeeeeER . . .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,2] D==============================eeER. . .. fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,3] D================================eeeeeER. .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,4] D==================================eeeeeER .. fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: [1,5] D=======================================eeeeeER fmlal v0.4s, v0.4h, v1.4h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 12.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 
2 15.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 2. 2 20.0 0.0 0.0 fadd v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 3. 2 22.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 4. 2 24.0 0.0 0.0 fmlal v0.4s, v1.4h, v2.4h +# CHECK-NEXT: 5. 2 29.0 0.0 0.0 fmlal v0.4s, v0.4h, v1.4h +# CHECK-NEXT: 2 20.3 0.1 0.0 + +# CHECK: [13] Code Region - bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D================eeeeER . . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER bfdot v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfdot v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfdot v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [14] Code Region - bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D===================eeeeeER . . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D======================eeeeeER. . bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D===========================eeeeeER bfmmla v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfmmla v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 20.0 0.0 0.0 bfmmla v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 14.0 0.1 0.0 + +# CHECK: [15] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 1.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: [1,1] D================eeeeER . . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: [1,3] D======================eeeeER bfmlalb v0.4s, v0.8h, v1.8h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul v0.2d, v0.2d, v0.2d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfmlalb v0.4s, v1.8h, v2.8h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfmlalb v0.4s, v0.8h, v1.8h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [16] Code Region - crc32cb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 703 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.57 +# CHECK-NEXT: IPC: 0.57 +# CHECK-NEXT: Block RThroughput: 3.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeeER. . .. mul w0, w0, w0 +# CHECK-NEXT: [0,1] D==eeER . .. crc32cb w0, w0, w1 +# CHECK-NEXT: [0,2] D===eeER . .. crc32cb w0, w0, w1 +# CHECK-NEXT: [0,3] D=====eeER. .. crc32cb w0, w0, w0 +# CHECK-NEXT: [1,0] D=======eeER .. mul w0, w0, w0 +# CHECK-NEXT: [1,1] D=========eeER .. crc32cb w0, w0, w1 +# CHECK-NEXT: [1,2] D==========eeER.. crc32cb w0, w0, w1 +# CHECK-NEXT: [1,3] D============eeER crc32cb w0, w0, w0 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 4.5 0.5 0.0 mul w0, w0, w0 +# CHECK-NEXT: 1. 2 6.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 2. 2 7.5 0.0 0.0 crc32cb w0, w0, w1 +# CHECK-NEXT: 3. 2 9.5 0.0 0.0 crc32cb w0, w0, w0 +# CHECK-NEXT: 2 7.0 0.1 0.0 + +# CHECK: [17] Code Region - Z sdot.s + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. 
sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [18] Code Region - Z sudot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,2] D==================eeeER .. 
sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.s, z0.b, z1.b[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.s, z1.b, z2.b[1] +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.s, z0.b, z1.b[1] +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [19] Code Region - Z sdot.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1403 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.36 +# CHECK-NEXT: IPC: 0.29 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h +# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,2] D====================eeeeER . 
sdot z0.d, z1.h, z2.h +# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h +# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h +# CHECK-NEXT: 2 13.3 0.1 0.0 + +# CHECK: [20] Code Region - Z smmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1203 +# CHECK-NEXT: Total uOps: 500 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.42 +# CHECK-NEXT: IPC: 0.33 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 0123456 + +# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeER . . .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,2] D======eeeER . . .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [0,3] D=========eeeER. . .. smmla z0.s, z0.b, z1.b +# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D=================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,2] D==================eeeER .. smmla z0.s, z1.b, z2.b +# CHECK-NEXT: [1,3] D=====================eeeER smmla z0.s, z0.b, z1.b + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 
2 7.0 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 2. 2 13.0 0.0 0.0 smmla z0.s, z1.b, z2.b +# CHECK-NEXT: 3. 2 16.0 0.0 0.0 smmla z0.s, z0.b, z1.b +# CHECK-NEXT: 2 12.0 0.1 0.0 + +# CHECK: [21] Code Region - Z mla.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER mla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 mla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [22] Code Region - Z mad.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. mad z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER mad z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 mad z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 mad z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [23] Code Region - Z msb.d + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1703 +# CHECK-NEXT: Total uOps: 800 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.47 +# CHECK-NEXT: IPC: 0.23 +# CHECK-NEXT: Block RThroughput: 8.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 0123456 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeeeER . . . . . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [0,1] D=====eeeeeER . . . . .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=======eeeeeER. . . . .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D============eeeeeER. . . .. msb z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=================eeeeeER. . .. mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: [1,1] D======================eeeeeER. .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D========================eeeeeER .. msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] .D============================eeeeeER msb z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.5 0.5 0.0 mul z0.d, p0/m, z0.d, z0.d +# CHECK-NEXT: 1. 2 14.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 16.5 0.0 0.0 msb z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 21.0 0.0 0.0 msb z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 15.4 0.1 0.0 + +# CHECK: [24] Code Region - Z fcmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.d, p0/m, z0.d, z1.d, #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.d, p0/m, z1.d, z2.d, #90 +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 fcmla z0.d, p0/m, z0.d, z1.d, #90 +# CHECK-NEXT: 2 13.0 0.1 0.0 + +# CHECK: [25] Code Region - Z fcmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,2] D====================eeeeeER . . fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: [1,3] D=========================eeeeeER fcmla z0.s, z0.s, z1.s[1], #90 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 fcmla z0.s, z1.s, z2.s[1], #90 +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 fcmla z0.s, z0.s, z1.s[1], #90 +# CHECK-NEXT: 2 13.0 0.1 0.0 + +# CHECK: [26] Code Region - Z fmla ZPmZZ + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, p0/m, z0.d, z1.d + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, p0/m, z1.d, z2.d +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fmla z0.d, p0/m, z0.d, z1.d +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [27] Code Region - Z fmla ZZZI + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,2] D=====eeeeER . . . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [0,3] D=========eeeeER . . . fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,2] D==================eeeeER. . fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: [1,3] D======================eeeeER fmla z0.d, z0.d, z1.d[1] + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 fmla z0.d, z1.d, z2.d[1] +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 fmla z0.d, z0.d, z1.d[1] +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [28] Code Region - Z bfdot + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1303 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.31 +# CHECK-NEXT: IPC: 0.31 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 + +# CHECK: [0,0] DeeeER . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeER. . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeER . . . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D=========eeeeER . . . bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D=============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D================eeeeER . . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D==================eeeeER. . bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D======================eeeeER bfdot z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 7.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 10.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 12.5 0.0 0.0 bfdot z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 16.5 0.0 0.0 bfdot z0.s, z0.h, z1.h +# CHECK-NEXT: 2 11.8 0.1 0.0 + +# CHECK: [29] Code Region - Z bfmmla + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1603 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.25 +# CHECK-NEXT: IPC: 0.25 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 01234 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D======eeeeeER . . . . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D===========eeeeeER . . . . bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D================eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D===================eeeeeER . . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D======================eeeeeER. . bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D===========================eeeeeER bfmmla z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 9.0 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 12.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 15.0 0.0 0.0 bfmmla z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 20.0 0.0 0.0 bfmmla z0.s, z0.h, z1.h +# CHECK-NEXT: 2 14.0 0.1 0.0 + +# CHECK: [30] Code Region - bfmlalb + +# CHECK: Iterations: 100 +# CHECK-NEXT: Instructions: 400 +# CHECK-NEXT: Total Cycles: 1503 +# CHECK-NEXT: Total uOps: 400 + +# CHECK: Dispatch Width: 15 +# CHECK-NEXT: uOps Per Cycle: 0.27 +# CHECK-NEXT: IPC: 0.27 +# CHECK-NEXT: Block RThroughput: 2.0 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123456789 012 +# CHECK-NEXT: Index 0123456789 0123456789 + +# CHECK: [0,0] DeeeER . . . . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [0,1] D===eeeeeER . . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,2] D=====eeeeeER . . . . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [0,3] D==========eeeeeER . . . . bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: [1,0] D===============eeeER . . . fmul z0.d, z0.d, z0.d +# CHECK-NEXT: [1,1] D==================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,2] D====================eeeeeER . . bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: [1,3] D=========================eeeeeER bfmlalb z0.s, z0.h, z1.h + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 8.5 0.5 0.0 fmul z0.d, z0.d, z0.d +# CHECK-NEXT: 1. 2 11.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 2. 2 13.5 0.0 0.0 bfmlalb z0.s, z1.h, z2.h +# CHECK-NEXT: 3. 
2 18.5 0.0 0.0 bfmlalb z0.s, z0.h, z1.h +# CHECK-NEXT: 2 13.0 0.1 0.0 diff --git a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s index 1e8df4770d79..65b73177c7b7 100644 --- a/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s +++ b/llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-neon-instructions.s @@ -1365,8 +1365,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 fcmgt s10, s11, s12 # CHECK-NEXT: 1 2 0.25 fcmgt v0.4s, v0.4s, #0.0 # CHECK-NEXT: 1 2 0.25 fcmgt v0.4s, v0.4s, v0.4s -# CHECK-NEXT: 1 2 0.25 fcmla v0.2s, v0.2s, v0.2s, #90 -# CHECK-NEXT: 1 2 0.25 fcmla v0.4s, v0.4s, v0.s[1], #0 +# CHECK-NEXT: 1 4 0.25 fcmla v0.2s, v0.2s, v0.2s, #90 +# CHECK-NEXT: 1 4 0.25 fcmla v0.4s, v0.4s, v0.s[1], #0 # CHECK-NEXT: 1 2 0.25 fcmle d20, d21, #0.0 # CHECK-NEXT: 1 2 0.25 fcmle s10, s11, #0.0 # CHECK-NEXT: 1 2 0.25 fcmle v0.2d, v0.2d, #0.0 @@ -1651,7 +1651,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 7 8 1.00 * ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] # CHECK-NEXT: 8 8 1.00 * ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 # CHECK-NEXT: 8 8 1.00 * ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 -# CHECK-NEXT: 1 2 0.25 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mla v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 4 0.50 mls v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 2 0.25 mov b0, v0.b[15] # CHECK-NEXT: 1 2 0.25 mov d6, v0.d[1] @@ -1673,7 +1673,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 movi v0.2s, #8, msl #8 # CHECK-NEXT: 1 2 0.25 movi v0.4s, #255, lsl #24 # CHECK-NEXT: 1 2 0.25 movi v0.8b, #255 -# CHECK-NEXT: 1 2 0.25 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: 1 4 0.50 mul v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 mvni v0.2s, #0 # CHECK-NEXT: 1 2 0.25 mvni v0.4s, #16, msl #16 # CHECK-NEXT: 1 2 0.25 neg d29, d24 @@ -1780,10 +1780,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 4 1.00 scvtf v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 scvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 4 6 1.00 scvtf v0.8h, v0.8h 
-# CHECK-NEXT: 1 2 0.25 sdot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 sdot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 sdot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 sdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 sdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 sdot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 shadd v0.8b, v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 shl d7, d10, #12 # CHECK-NEXT: 1 2 0.50 shl v0.16b, v0.16b, #3 @@ -1873,26 +1873,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 sqadd b20, b11, b15 # CHECK-NEXT: 1 2 0.25 sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.25 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: 1 2 0.25 sqdmlal d19, s24, s12 +# CHECK-NEXT: 1 4 0.50 sqdmlal d19, s24, s12 # CHECK-NEXT: 1 4 0.50 sqdmlal d8, s9, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmlal s17, h27, h12 +# CHECK-NEXT: 1 4 0.50 sqdmlal s17, h27, h12 # CHECK-NEXT: 1 4 0.50 sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqdmlsl d12, s23, s13 +# CHECK-NEXT: 1 4 0.50 sqdmlsl d12, s23, s13 # CHECK-NEXT: 1 4 0.50 sqdmlsl d8, s9, v0.s[1] # CHECK-NEXT: 1 4 0.50 sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmlsl s14, h12, h25 +# CHECK-NEXT: 1 4 0.50 sqdmlsl s14, h12, h25 # CHECK-NEXT: 1 4 0.50 sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: 1 4 0.50 sqdmlsl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqdmulh h10, h11, h12 -# CHECK-NEXT: 1 2 0.25 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqdmulh s15, s14, v0.s[1] -# CHECK-NEXT: 1 2 0.25 sqdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqdmulh h7, h15, v0.h[3] +# 
CHECK-NEXT: 1 4 0.50 sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: 1 4 0.50 sqdmulh v0.4s, v0.4s, v0.4s # CHECK-NEXT: 1 3 0.50 sqdmull d1, s1, v0.s[1] @@ -1914,34 +1914,34 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 sqneg v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 sqneg v0.8b, v0.8b # CHECK-NEXT: 1 2 0.25 sqneg v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 sqrdmlah h0, h1, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlah s0, s1, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: 1 4 0.50 sqrdmlah h0, h1, h2 # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4h, v1.4h, v2.4h # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.8h, v1.8h, v2.8h # CHECK-NEXT: 1 4 0.50 sqrdmlah s0, s1, s2 # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.2s, v1.2s, v2.2s # CHECK-NEXT: 1 4 0.50 sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 1 2 0.25 sqrdmlsh h0, h1, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh s0, s1, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: 1 4 
0.50 sqrdmlsh h0, h1, h2 # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4h, v1.4h, v2.4h # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.8h, v1.8h, v2.8h # CHECK-NEXT: 1 4 0.50 sqrdmlsh s0, s1, s2 # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.2s, v1.2s, v2.2s # CHECK-NEXT: 1 4 0.50 sqrdmlsh v0.4s, v1.4s, v2.4s -# CHECK-NEXT: 1 2 0.25 sqrdmulh h10, h11, h12 -# CHECK-NEXT: 1 2 0.25 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: 1 2 0.25 sqrdmulh s15, s14, v0.s[1] -# CHECK-NEXT: 1 2 0.25 sqrdmulh s20, s21, s2 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h10, h11, h12 +# CHECK-NEXT: 1 4 0.50 sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: 1 4 0.50 sqrdmulh s20, s21, s2 # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: 1 4 0.50 sqrdmulh v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 sqrshl d31, d31, d31 @@ -2124,8 +2124,8 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 5 4 1.00 * st4 { v0.d, v1.d, v2.d, v3.d }[1], [x0], x5 # CHECK-NEXT: 1 2 0.25 sub d15, d5, d16 # CHECK-NEXT: 1 2 0.25 sub v0.2d, v0.2d, v0.2d -# CHECK-NEXT: 1 2 0.25 sudot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 sudot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sudot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 sudot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 suqadd b19, b14 # CHECK-NEXT: 1 2 0.25 suqadd d18, d22 # CHECK-NEXT: 1 2 0.25 suqadd h20, h15 @@ -2222,10 +2222,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 2 4 1.00 ucvtf v0.4s, v0.4s # CHECK-NEXT: 1 2 0.25 ucvtf v0.4s, v0.4s, #3 # CHECK-NEXT: 4 6 1.00 ucvtf v0.8h, v0.8h -# CHECK-NEXT: 1 2 0.25 udot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 udot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 udot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 udot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 udot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 udot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 udot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 udot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.25 uhadd v0.16b, v0.16b, v0.16b # 
CHECK-NEXT: 1 2 0.25 uhadd v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 2 0.25 uhsub v0.4s, v0.4s, v0.4s @@ -2356,10 +2356,10 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: 1 4 0.50 ursra v0.4s, v0.4s, #3 # CHECK-NEXT: 1 4 0.50 ursra v0.8b, v0.8b, #3 # CHECK-NEXT: 1 4 0.50 ursra v0.8h, v0.8h, #3 -# CHECK-NEXT: 1 2 0.25 usdot v0.2s, v0.8b, v0.4b[2] -# CHECK-NEXT: 1 2 0.25 usdot v0.2s, v0.8b, v0.8b -# CHECK-NEXT: 1 2 0.25 usdot v0.4s, v0.16b, v0.16b -# CHECK-NEXT: 1 2 0.25 usdot v0.4s, v0.16b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 usdot v0.2s, v0.8b, v0.4b[2] +# CHECK-NEXT: 1 3 0.25 usdot v0.2s, v0.8b, v0.8b +# CHECK-NEXT: 1 3 0.25 usdot v0.4s, v0.16b, v0.16b +# CHECK-NEXT: 1 3 0.25 usdot v0.4s, v0.16b, v0.4b[2] # CHECK-NEXT: 1 2 0.50 ushl d0, d0, d0 # CHECK-NEXT: 1 2 0.50 ushl v0.16b, v0.16b, v0.16b # CHECK-NEXT: 1 2 0.50 ushl v0.4s, v0.4s, v0.4s @@ -2465,7 +2465,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] -# CHECK-NEXT: - - - - 26.67 49.17 49.17 18.75 7.75 7.75 7.75 394.50 377.00 349.00 331.50 +# CHECK-NEXT: - - - - 26.67 49.17 49.17 18.75 7.75 7.75 7.75 401.00 370.50 355.50 325.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0.0] [0.1] [1.0] [1.1] [2] [3.0] [3.1] [4] [5] [6.0] [6.1] [7] [8] [9] [10] Instructions: @@ -2892,7 +2892,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - 1.00 1.00 1.00 - - - - 1.00 1.00 1.00 1.00 ld4r { v0.2d, v1.2d, v2.2d, v3.2d }, [sp] # CHECK-NEXT: - - - - 1.00 1.00 1.00 0.25 0.25 0.25 0.25 1.00 1.00 1.00 1.00 ld4r { v0.2s, v1.2s, v2.2s, v3.2s }, [sp], #16 # CHECK-NEXT: - - - - 1.00 1.00 1.00 0.25 0.25 0.25 0.25 1.00 1.00 1.00 1.00 ld4r { v0.4s, v1.4s, v2.4s, v3.4s }, [sp], x8 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mla v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mla v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mls v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - 
- - - - - - - 0.25 0.25 0.25 0.25 mov b0, v0.b[15] # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mov d6, v0.d[1] @@ -2914,7 +2914,7 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.2s, #8, msl #8 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.4s, #255, lsl #24 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 movi v0.8b, #255 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mul v0.8b, v0.8b, v0.8b +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - mul v0.8b, v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mvni v0.2s, #0 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 mvni v0.4s, #16, msl #16 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 neg d29, d24 @@ -3114,26 +3114,26 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd b20, b11, b15 # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd v0.16b, v0.16b, v0.16b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqadd v0.2s, v0.2s, v0.2s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlal d19, s24, s12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal d19, s24, s12 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal d8, s9, v0.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal s0, h0, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlal s17, h27, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal s17, h27, h12 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlal2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlsl d12, s23, s13 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl d12, s23, s13 # 
CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl d8, s9, v0.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl s0, h0, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmlsl s14, h12, h25 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl s14, h12, h25 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl v0.2d, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl v0.4s, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl2 v0.2d, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmlsl2 v0.4s, v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh h10, h11, h12 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh s15, s14, v0.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh s20, s21, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh v0.2s, v0.2s, v0.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmulh v0.4s, v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqdmull d1, s1, v0.s[1] @@ -3155,34 +3155,34 @@ zip2 v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.4s, v0.4s # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.8b, v0.8b # CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqneg v0.8h, v0.8h -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah h0, h1, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.8h, v1.8h, v2.h[3] -# 
CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah s0, s1, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlah v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah h0, h1, h2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4h, v1.4h, v2.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.8h, v1.8h, v2.8h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah s0, s1, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.2s, v1.2s, v2.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlah v0.4s, v1.4s, v2.4s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh h0, h1, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.4h, v1.4h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.8h, v1.8h, v2.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh s0, s1, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.2s, v1.2s, v2.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmlsh v0.4s, v1.4s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh h0, h1, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - 
sqrdmlsh s0, s1, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4s, v1.4s, v2.s[1] # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh h0, h1, h2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4h, v1.4h, v2.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.8h, v1.8h, v2.8h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh s0, s1, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.2s, v1.2s, v2.2s # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmlsh v0.4s, v1.4s, v2.4s -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh h10, h11, h12 -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh h7, h15, v0.h[3] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh s15, s14, v0.s[1] -# CHECK-NEXT: - - - - - - - - - - - 0.25 0.25 0.25 0.25 sqrdmulh s20, s21, s2 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh h10, h11, h12 +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh h7, h15, v0.h[3] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh s15, s14, v0.s[1] +# CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh s20, s21, s2 # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh v0.4h, v0.4h, v0.4h # CHECK-NEXT: - - - - - - - - - - - 0.50 - 0.50 - sqrdmulh v0.8h, v0.8h, v0.8h # CHECK-NEXT: - - - - - - - - - - - - 0.50 - 0.50 sqrshl d31, d31, d31 -- GitLab From 90627a5a190a99ae2991d524580d866484aaba16 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Fri, 11 Oct 2024 14:01:58 +0200 Subject: [PATCH 025/345] Revert "[XRay] Add support for instrumentation of DSOs on x86_64 (#90959)" This reverts commit a4402039bffd788b9af82435fd5a2fb311fdc6e8 and 4451f9f812d458f6b53785b27869674caf01e67b --- clang/include/clang/Basic/CodeGenOptions.def | 2 - clang/include/clang/Driver/Options.td | 5 - clang/include/clang/Driver/XRayArgs.h | 2 - 
clang/lib/Driver/ToolChains/CommonArgs.cpp | 12 +- clang/lib/Driver/XRayArgs.cpp | 21 -- clang/test/Driver/XRay/xray-shared.cpp | 17 - .../cmake/Modules/AllSupportedArchDefs.cmake | 1 - compiler-rt/cmake/config-ix.cmake | 4 - compiler-rt/include/xray/xray_interface.h | 65 +--- compiler-rt/lib/xray/CMakeLists.txt | 86 +----- compiler-rt/lib/xray/xray_dso_init.cpp | 62 ---- compiler-rt/lib/xray/xray_init.cpp | 183 ++--------- compiler-rt/lib/xray/xray_interface.cpp | 291 ++++-------------- .../lib/xray/xray_interface_internal.h | 83 +---- compiler-rt/lib/xray/xray_trampoline_x86_64.S | 24 +- compiler-rt/lib/xray/xray_x86_64.cpp | 23 +- .../xray/TestCases/Posix/basic-mode-dso.cpp | 47 --- .../TestCases/Posix/clang-xray-shared.cpp | 14 - .../test/xray/TestCases/Posix/dlopen.cpp | 107 ------- .../xray/TestCases/Posix/dso-dep-chains.cpp | 197 ------------ .../TestCases/Posix/patch-premain-dso.cpp | 45 --- .../Posix/patching-unpatching-dso.cpp | 75 ----- 22 files changed, 147 insertions(+), 1219 deletions(-) delete mode 100644 clang/test/Driver/XRay/xray-shared.cpp delete mode 100644 compiler-rt/lib/xray/xray_dso_init.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/dlopen.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp delete mode 100644 compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index e45370bde74a..eac831278ee2 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -136,8 +136,6 @@ CODEGENOPT(XRayIgnoreLoops , 1, 0) ///< Emit the XRay function index section. 
CODEGENOPT(XRayFunctionIndex , 1, 1) -///< Set when -fxray-shared is enabled -CODEGENOPT(XRayShared , 1, 0) ///< Set the minimum number of instructions in a function to determine selective ///< XRay instrumentation. diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 4ee16e213d0e..d306c751505e 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2946,11 +2946,6 @@ def fxray_selected_function_group : HelpText<"When using -fxray-function-groups, select which group of functions to instrument. Valid range is 0 to fxray-function-groups - 1">, MarshallingInfoInt, "0">; -defm xray_shared : BoolFOption<"xray-shared", - CodeGenOpts<"XRayShared">, DefaultFalse, - PosFlag, - NegFlag>; defm fine_grained_bitfield_accesses : BoolOption<"f", "fine-grained-bitfield-accesses", CodeGenOpts<"FineGrainedBitfieldAccesses">, DefaultFalse, diff --git a/clang/include/clang/Driver/XRayArgs.h b/clang/include/clang/Driver/XRayArgs.h index 1b5c4a4c42f1..bdd3d979547e 100644 --- a/clang/include/clang/Driver/XRayArgs.h +++ b/clang/include/clang/Driver/XRayArgs.h @@ -27,7 +27,6 @@ class XRayArgs { XRayInstrSet InstrumentationBundle; llvm::opt::Arg *XRayInstrument = nullptr; bool XRayRT = true; - bool XRayShared = false; public: /// Parses the XRay arguments from an argument list. 
@@ -36,7 +35,6 @@ public: llvm::opt::ArgStringList &CmdArgs, types::ID InputType) const; bool needsXRayRt() const { return XRayInstrument && XRayRT; } - bool needsXRayDSORt() const { return XRayInstrument && XRayRT && XRayShared; } llvm::ArrayRef modeList() const { return Modes; } XRayInstrSet instrumentationBundle() const { return InstrumentationBundle; } }; diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 0a1b7c209563..0c6a585c3acf 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1613,14 +1613,10 @@ bool tools::addSanitizerRuntimes(const ToolChain &TC, const ArgList &Args, } bool tools::addXRayRuntime(const ToolChain&TC, const ArgList &Args, ArgStringList &CmdArgs) { - if (Args.hasArg(options::OPT_shared)) { - if (TC.getXRayArgs().needsXRayDSORt()) { - CmdArgs.push_back("--whole-archive"); - CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray-dso")); - CmdArgs.push_back("--no-whole-archive"); - return true; - } - } else if (TC.getXRayArgs().needsXRayRt()) { + if (Args.hasArg(options::OPT_shared)) + return false; + + if (TC.getXRayArgs().needsXRayRt()) { CmdArgs.push_back("--whole-archive"); CmdArgs.push_back(TC.getCompilerRTArgString(Args, "xray")); for (const auto &Mode : TC.getXRayArgs().modeList()) diff --git a/clang/lib/Driver/XRayArgs.cpp b/clang/lib/Driver/XRayArgs.cpp index d0bb5d4887c1..8c5134e25013 100644 --- a/clang/lib/Driver/XRayArgs.cpp +++ b/clang/lib/Driver/XRayArgs.cpp @@ -63,23 +63,6 @@ XRayArgs::XRayArgs(const ToolChain &TC, const ArgList &Args) { << XRayInstrument->getSpelling() << Triple.str(); } - if (Args.hasFlag(options::OPT_fxray_shared, options::OPT_fno_xray_shared, - false)) { - XRayShared = true; - - // DSO instrumentation is currently limited to x86_64 - if (Triple.getArch() != llvm::Triple::x86_64) { - D.Diag(diag::err_drv_unsupported_opt_for_target) - << "-fxray-shared" << Triple.str(); - } - - unsigned PICLvl = 
std::get<1>(tools::ParsePICArgs(TC, Args)); - if (!PICLvl) { - D.Diag(diag::err_opt_not_valid_without_opt) << "-fxray-shared" - << "-fPIC"; - } - } - // Both XRay and -fpatchable-function-entry use // TargetOpcode::PATCHABLE_FUNCTION_ENTER. if (Arg *A = Args.getLastArg(options::OPT_fpatchable_function_entry_EQ)) @@ -194,10 +177,6 @@ void XRayArgs::addArgs(const ToolChain &TC, const ArgList &Args, Args.addOptOutFlag(CmdArgs, options::OPT_fxray_function_index, options::OPT_fno_xray_function_index); - if (XRayShared) - Args.addOptInFlag(CmdArgs, options::OPT_fxray_shared, - options::OPT_fno_xray_shared); - if (const Arg *A = Args.getLastArg(options::OPT_fxray_instruction_threshold_EQ)) { int Value; diff --git a/clang/test/Driver/XRay/xray-shared.cpp b/clang/test/Driver/XRay/xray-shared.cpp deleted file mode 100644 index 215854e1fc7c..000000000000 --- a/clang/test/Driver/XRay/xray-shared.cpp +++ /dev/null @@ -1,17 +0,0 @@ -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fpic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-PIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC -// RUN: not %clang -### --target=x86_64-unknown-linux-gnu -fno-pic -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-PIC - -// On 64 bit darwin, PIC is always enabled -// RUN: %clang -### --target=x86_64-apple-darwin -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s - -// Check unsupported targets -// RUN: not %clang -### --target=aarch64-pc-freebsd -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s 
--check-prefix=ERR-TARGET -// RUN: not %clang -### --target=arm64-apple-macos -fPIC -fxray-instrument -fxray-shared -c %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR-TARGET - -// CHECK: "-cc1" {{.*}}"-fxray-instrument" {{.*}}"-fxray-shared" -// ERR-TARGET: error: unsupported option '-fxray-shared' for target -// ERR-PIC: error: option '-fxray-shared' cannot be specified without '-fPIC' - diff --git a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake index 50a4256b82fe..809e92771569 100644 --- a/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake +++ b/compiler-rt/cmake/Modules/AllSupportedArchDefs.cmake @@ -104,7 +104,6 @@ else() set(ALL_XRAY_SUPPORTED_ARCH ${X86_64} ${ARM32} ${ARM64} ${MIPS32} ${MIPS64} powerpc64le ${HEXAGON} ${LOONGARCH64}) endif() -set(ALL_XRAY_DSO_SUPPORTED_ARCH ${X86_64}) set(ALL_SHADOWCALLSTACK_SUPPORTED_ARCH ${ARM64}) if (UNIX) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index 6134c9876b38..a93a88a92050 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -668,9 +668,6 @@ if(APPLE) list_intersect(XRAY_SUPPORTED_ARCH ALL_XRAY_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) - list_intersect(XRAY_DSO_SUPPORTED_ARCH - ALL_XRAY_DSO_SUPPORTED_ARCH - SANITIZER_COMMON_SUPPORTED_ARCH) list_intersect(SHADOWCALLSTACK_SUPPORTED_ARCH ALL_SHADOWCALLSTACK_SUPPORTED_ARCH SANITIZER_COMMON_SUPPORTED_ARCH) @@ -705,7 +702,6 @@ else() filter_available_targets(CFI_SUPPORTED_ARCH ${ALL_CFI_SUPPORTED_ARCH}) filter_available_targets(SCUDO_STANDALONE_SUPPORTED_ARCH ${ALL_SCUDO_STANDALONE_SUPPORTED_ARCH}) filter_available_targets(XRAY_SUPPORTED_ARCH ${ALL_XRAY_SUPPORTED_ARCH}) - filter_available_targets(XRAY_DSO_SUPPORTED_ARCH ${ALL_XRAY_DSO_SUPPORTED_ARCH}) filter_available_targets(SHADOWCALLSTACK_SUPPORTED_ARCH ${ALL_SHADOWCALLSTACK_SUPPORTED_ARCH}) filter_available_targets(GWP_ASAN_SUPPORTED_ARCH 
${ALL_GWP_ASAN_SUPPORTED_ARCH}) diff --git a/compiler-rt/include/xray/xray_interface.h b/compiler-rt/include/xray/xray_interface.h index 675ea0cbc48c..727431c04e4f 100644 --- a/compiler-rt/include/xray/xray_interface.h +++ b/compiler-rt/include/xray/xray_interface.h @@ -93,78 +93,31 @@ enum XRayPatchingStatus { FAILED = 3, }; -/// This tells XRay to patch the instrumentation points in all currently loaded -/// objects. See XRayPatchingStatus for possible result values. +/// This tells XRay to patch the instrumentation points. See XRayPatchingStatus +/// for possible result values. extern XRayPatchingStatus __xray_patch(); -/// This tells XRay to patch the instrumentation points in the given object. -/// See XRayPatchingStatus for possible result values. -extern XRayPatchingStatus __xray_patch_object(int32_t ObjId); - /// Reverses the effect of __xray_patch(). See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_unpatch(); -/// Reverses the effect of __xray_patch_object. See XRayPatchingStatus for -/// possible result values. -extern XRayPatchingStatus __xray_unpatch_object(int32_t ObjId); - -/// This unpacks the given (packed) function id and patches -/// the corresponding function. See XRayPatchingStatus for possible +/// This patches a specific function id. See XRayPatchingStatus for possible /// result values. extern XRayPatchingStatus __xray_patch_function(int32_t FuncId); -/// This patches a specific function in the given object. See XRayPatchingStatus -/// for possible result values. -extern XRayPatchingStatus __xray_patch_function_in_object(int32_t FuncId, - int32_t ObjId); - -/// This unpacks the given (packed) function id and unpatches -/// the corresponding function. See XRayPatchingStatus for possible +/// This unpatches a specific function id. See XRayPatchingStatus for possible /// result values. 
extern XRayPatchingStatus __xray_unpatch_function(int32_t FuncId); -/// This unpatches a specific function in the given object. -/// See XRayPatchingStatus for possible result values. -extern XRayPatchingStatus __xray_unpatch_function_in_object(int32_t FuncId, - int32_t ObjId); - -/// This function unpacks the given (packed) function id and returns the address -/// of the corresponding function. We return 0 if we encounter any error, even -/// if 0 may be a valid function address. +/// This function returns the address of the function provided a valid function +/// id. We return 0 if we encounter any error, even if 0 may be a valid function +/// address. extern uintptr_t __xray_function_address(int32_t FuncId); -/// This function returns the address of the function in the given object -/// provided valid function and object ids. We return 0 if we encounter any -/// error, even if 0 may be a valid function address. -extern uintptr_t __xray_function_address_in_object(int32_t FuncId, - int32_t ObjId); - -/// This function returns the maximum valid function id for the main executable -/// (object id = 0). Returns 0 if we encounter errors (when there are no -/// instrumented functions, etc.). +/// This function returns the maximum valid function id. Returns 0 if we +/// encounter errors (when there are no instrumented functions, etc.). extern size_t __xray_max_function_id(); -/// This function returns the maximum valid function id for the given object. -/// Returns 0 if we encounter errors (when there are no instrumented functions, -/// etc.). -extern size_t __xray_max_function_id_in_object(int32_t ObjId); - -/// This function returns the number of previously registered objects -/// (executable + loaded DSOs). Returns 0 if XRay has not been initialized. -extern size_t __xray_num_objects(); - -/// Unpacks the function id from the given packed id. -extern int32_t __xray_unpack_function_id(int32_t PackedId); - -/// Unpacks the object id from the given packed id. 
-extern int32_t __xray_unpack_object_id(int32_t PackedId); - -/// Creates and returns a packed id from the given function and object ids. -/// If the ids do not fit within the reserved number of bits for each part, the -/// high bits are truncated. -extern int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId); - /// Initialize the required XRay data structures. This is useful in cases where /// users want to control precisely when the XRay instrumentation data /// structures are initialized, for example when the XRay library is built with diff --git a/compiler-rt/lib/xray/CMakeLists.txt b/compiler-rt/lib/xray/CMakeLists.txt index f38c07420c9a..cf7b5062aae3 100644 --- a/compiler-rt/lib/xray/CMakeLists.txt +++ b/compiler-rt/lib/xray/CMakeLists.txt @@ -10,10 +10,6 @@ set(XRAY_SOURCES xray_utils.cpp ) -set(XRAY_DSO_SOURCES - xray_dso_init.cpp - ) - # Implementation files for all XRay modes. set(XRAY_FDR_MODE_SOURCES xray_fdr_flags.cpp @@ -37,11 +33,6 @@ set(x86_64_SOURCES xray_trampoline_x86_64.S ) -set(x86_64_DSO_SOURCES - xray_trampoline_x86_64.S - ) - - set(arm_SOURCES xray_arm.cpp xray_trampoline_arm.S @@ -137,12 +128,10 @@ set(XRAY_IMPL_HEADERS # consumption by tests. set(XRAY_ALL_SOURCE_FILES ${XRAY_SOURCES} - ${XRAY_DSO_SOURCES} ${XRAY_FDR_MODE_SOURCES} ${XRAY_BASIC_MODE_SOURCES} ${XRAY_PROFILING_MODE_SOURCES} ${x86_64_SOURCES} - ${x86_64_DSO_SOURCES} ${arm_SOURCES} ${armhf_SOURCES} ${hexagon_SOURCES} @@ -173,9 +162,6 @@ set(XRAY_CFLAGS ${COMPILER_RT_CXX_CFLAGS}) set(XRAY_COMMON_DEFINITIONS SANITIZER_COMMON_NO_REDEFINE_BUILTINS XRAY_HAS_EXCEPTIONS=1) -# DSO trampolines need to be compiled with GOT addressing -set(XRAY_COMMON_DEFINITIONS_DSO ${XRAY_COMMON_DEFINITIONS} XRAY_PIC) - # Too many existing bugs, needs cleanup. 
append_list_if(COMPILER_RT_HAS_WNO_FORMAT -Wno-format XRAY_CFLAGS) @@ -215,16 +201,7 @@ if (APPLE) CFLAGS ${XRAY_CFLAGS} DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) - add_compiler_rt_object_libraries(RTXrayDSO - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - SOURCES ${XRAY_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) set(XRAY_RTXRAY_ARCH_LIBS "") - set(XRAY_DSO_RTXRAY_ARCH_LIBS "") foreach(arch ${XRAY_SUPPORTED_ARCH}) if(NOT ${arch} IN_LIST XRAY_SOURCE_ARCHS) continue() @@ -238,17 +215,6 @@ if (APPLE) DEFS ${XRAY_COMMON_DEFINITIONS} DEPS ${XRAY_DEPS}) list(APPEND XRAY_RTXRAY_ARCH_LIBS RTXray_${arch}) - if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) - add_compiler_rt_object_libraries(RTXrayDSO_${arch} - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - SOURCES ${${arch}_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) - list(APPEND XRAY_DSO_RTXRAY_ARCH_LIBS RTXrayDSO_${arch}) - endif() endforeach() add_compiler_rt_object_libraries(RTXrayFDR OS ${XRAY_SUPPORTED_OS} @@ -286,17 +252,6 @@ if (APPLE) LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} LINK_LIBS ${XRAY_LINK_LIBS} PARENT_TARGET xray) - add_compiler_rt_runtime(clang_rt.xray-dso - STATIC - OS ${XRAY_SUPPORTED_OS} - ARCHS ${XRAY_DSO_SUPPORTED_ARCH} - OBJECT_LIBS RTXrayDSO ${XRAY_DSO_RTXRAY_ARCH_LIBS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS} - LINK_FLAGS ${XRAY_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - PARENT_TARGET xray) - add_compiler_rt_runtime(clang_rt.xray-fdr STATIC OS ${XRAY_SUPPORTED_OS} @@ -391,37 +346,16 @@ else() # not Apple DEFS ${XRAY_COMMON_DEFINITIONS} OBJECT_LIBS RTXrayBASIC PARENT_TARGET xray) - # Profiler Mode runtime - add_compiler_rt_runtime(clang_rt.xray-profiling - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS 
${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayPROFILING - PARENT_TARGET xray) - - if (${arch} IN_LIST XRAY_DSO_SUPPORTED_ARCH) - # TODO: Only implemented for X86 at the moment - add_compiler_rt_object_libraries(RTXrayDSO - ARCHS ${arch} - SOURCES ${XRAY_DSO_SOURCES} ${${arch}_DSO_SOURCES} - ADDITIONAL_HEADERS ${XRAY_IMPL_HEADERS} - CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS_DSO} - DEPS ${XRAY_DEPS}) - # DSO runtime archive - add_compiler_rt_runtime(clang_rt.xray-dso - STATIC - ARCHS ${arch} - CFLAGS ${XRAY_CFLAGS} - LINK_FLAGS ${XRAY_LINK_FLAGS} - LINK_LIBS ${XRAY_LINK_LIBS} - DEFS ${XRAY_COMMON_DEFINITIONS} - OBJECT_LIBS RTXrayDSO - PARENT_TARGET xray) - endif() + # Profiler Mode runtime + add_compiler_rt_runtime(clang_rt.xray-profiling + STATIC + ARCHS ${arch} + CFLAGS ${XRAY_CFLAGS} + LINK_FLAGS ${XRAY_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + DEFS ${XRAY_COMMON_DEFINITIONS} + OBJECT_LIBS RTXrayPROFILING + PARENT_TARGET xray) endforeach() endif() # not Apple diff --git a/compiler-rt/lib/xray/xray_dso_init.cpp b/compiler-rt/lib/xray/xray_dso_init.cpp deleted file mode 100644 index eb754db54c64..000000000000 --- a/compiler-rt/lib/xray/xray_dso_init.cpp +++ /dev/null @@ -1,62 +0,0 @@ -//===-- xray_init.cpp -------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file is a part of XRay, a dynamic runtime instrumentation system. -// -// XRay initialisation logic for DSOs. 
-//===----------------------------------------------------------------------===// - -#include "sanitizer_common/sanitizer_atomic.h" -#include "xray_defs.h" -#include "xray_flags.h" -#include "xray_interface_internal.h" - -using namespace __sanitizer; - -extern "C" { -extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)) -__attribute__((visibility("hidden"))); -extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)) -__attribute__((visibility("hidden"))); - -#if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on -// Darwin, but it will probably not work at runtime. -extern const XRaySledEntry __start_xray_instr_map[] = {}; -extern const XRaySledEntry __stop_xray_instr_map[] = {}; -extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {}; -extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {}; -#endif -} - -// Handler functions to call in the patched entry/exit sled. -extern atomic_uintptr_t XRayPatchedFunction; -extern atomic_uintptr_t XRayArgLogger; -extern atomic_uintptr_t XRayPatchedCustomEvent; -extern atomic_uintptr_t XRayPatchedTypedEvent; - -static int __xray_object_id{-1}; - -// Note: .preinit_array initialization does not work for DSOs -__attribute__((constructor(0))) static void -__xray_init_dso() XRAY_NEVER_INSTRUMENT { - // Register sleds in main XRay runtime. - __xray_object_id = - __xray_register_dso(__start_xray_instr_map, __stop_xray_instr_map, - __start_xray_fn_idx, __stop_xray_fn_idx, {}); -} - -__attribute__((destructor(0))) static void -__xray_finalize_dso() XRAY_NEVER_INSTRUMENT { - // Inform the main runtime that this DSO is no longer used. 
- __xray_deregister_dso(__xray_object_id); -} diff --git a/compiler-rt/lib/xray/xray_init.cpp b/compiler-rt/lib/xray/xray_init.cpp index 53c93be89cd1..f22a31b95686 100644 --- a/compiler-rt/lib/xray/xray_init.cpp +++ b/compiler-rt/lib/xray/xray_init.cpp @@ -16,8 +16,6 @@ #include #include "sanitizer_common/sanitizer_common.h" -#include "xray/xray_interface.h" -#include "xray_allocator.h" #include "xray_defs.h" #include "xray_flags.h" #include "xray_interface_internal.h" @@ -30,7 +28,7 @@ extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); #if SANITIZER_APPLE -// HACK: This is a temporary workaround to make XRay build on +// HACK: This is a temporary workaround to make XRay build on // Darwin, but it will probably not work at runtime. const XRaySledEntry __start_xray_instr_map[] = {}; extern const XRaySledEntry __stop_xray_instr_map[] = {}; @@ -45,16 +43,14 @@ using namespace __xray; // the weak symbols defined above (__start_xray_inst_map and // __stop_xray_instr_map) to initialise the instrumentation map that XRay uses // for runtime patching/unpatching of instrumentation points. +// +// FIXME: Support DSO instrumentation maps too. The current solution only works +// for statically linked executables. atomic_uint8_t XRayInitialized{0}; // This should always be updated before XRayInitialized is updated. SpinMutex XRayInstrMapMutex; - -// Contains maps for the main executable as well as DSOs. -XRaySledMap *XRayInstrMaps; - -// Number of binary objects registered. -atomic_uint32_t XRayNumObjects{0}; +XRaySledMap XRayInstrMap; // Global flag to determine whether the flags have been initialized. atomic_uint8_t XRayFlagsInitialized{0}; @@ -62,63 +58,6 @@ atomic_uint8_t XRayFlagsInitialized{0}; // A mutex to allow only one thread to initialize the XRay data structures. 
SpinMutex XRayInitMutex; -// Registers XRay sleds and trampolines coming from the main executable or one -// of the linked DSOs. -// Returns the object ID if registration is successful, -1 otherwise. -int32_t -__xray_register_sleds(const XRaySledEntry *SledsBegin, - const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, bool FromDSO, - XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { - if (!SledsBegin || !SledsEnd) { - Report("Invalid XRay sleds.\n"); - return -1; - } - XRaySledMap SledMap; - SledMap.FromDSO = FromDSO; - SledMap.Loaded = true; - SledMap.Trampolines = Trampolines; - SledMap.Sleds = SledsBegin; - SledMap.Entries = SledsEnd - SledsBegin; - if (FnIndexBegin != nullptr) { - SledMap.SledsIndex = FnIndexBegin; - SledMap.Functions = FnIndexEnd - FnIndexBegin; - } else { - size_t CountFunctions = 0; - uint64_t LastFnAddr = 0; - - for (std::size_t I = 0; I < SledMap.Entries; I++) { - const auto &Sled = SledMap.Sleds[I]; - const auto Function = Sled.function(); - if (Function != LastFnAddr) { - CountFunctions++; - LastFnAddr = Function; - } - } - SledMap.SledsIndex = nullptr; - SledMap.Functions = CountFunctions; - } - if (SledMap.Functions >= XRayMaxFunctions) { - Report("Too many functions! Maximum is %ld\n", XRayMaxFunctions); - return -1; - } - - if (Verbosity()) - Report("Registering %d new functions!\n", SledMap.Functions); - - { - SpinMutexLock Guard(&XRayInstrMapMutex); - auto Idx = atomic_fetch_add(&XRayNumObjects, 1, memory_order_acq_rel); - if (Idx >= XRayMaxObjects) { - Report("Too many objects registered! Maximum is %ld\n", XRayMaxObjects); - return -1; - } - XRayInstrMaps[Idx] = std::move(SledMap); - return Idx; - } -} - // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. 
void __xray_init() XRAY_NEVER_INSTRUMENT { @@ -141,21 +80,29 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { return; } - atomic_store(&XRayNumObjects, 0, memory_order_release); - - // Pre-allocation takes up approx. 5kB for XRayMaxObjects=64. - XRayInstrMaps = allocateBuffer(XRayMaxObjects); - - int MainBinaryId = - __xray_register_sleds(__start_xray_instr_map, __stop_xray_instr_map, - __start_xray_fn_idx, __stop_xray_fn_idx, false, {}); + { + SpinMutexLock Guard(&XRayInstrMapMutex); + XRayInstrMap.Sleds = __start_xray_instr_map; + XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; + if (__start_xray_fn_idx != nullptr) { + XRayInstrMap.SledsIndex = __start_xray_fn_idx; + XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; + } else { + size_t CountFunctions = 0; + uint64_t LastFnAddr = 0; + + for (std::size_t I = 0; I < XRayInstrMap.Entries; I++) { + const auto &Sled = XRayInstrMap.Sleds[I]; + const auto Function = Sled.function(); + if (Function != LastFnAddr) { + CountFunctions++; + LastFnAddr = Function; + } + } - // The executable should always get ID 0. - if (MainBinaryId != 0) { - Report("Registering XRay sleds failed.\n"); - return; + XRayInstrMap.Functions = CountFunctions; + } } - atomic_store(&XRayInitialized, true, memory_order_release); #ifndef XRAY_NO_PREINIT @@ -164,84 +111,6 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { #endif } -// Registers XRay sleds and trampolines of an instrumented DSO. -// Returns the object ID if registration is successful, -1 otherwise. -// -// Default visibility is hidden, so we have to explicitly make it visible to -// DSO. -SANITIZER_INTERFACE_ATTRIBUTE int32_t __xray_register_dso( - const XRaySledEntry *SledsBegin, const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, - XRayTrampolines Trampolines) XRAY_NEVER_INSTRUMENT { - // Make sure XRay has been initialized in the main executable. 
- __xray_init(); - - if (__xray_num_objects() == 0) { - if (Verbosity()) - Report("No XRay instrumentation map in main executable. Not initializing " - "XRay for DSO.\n"); - return -1; - } - - // Register sleds in global map. - int ObjId = __xray_register_sleds(SledsBegin, SledsEnd, FnIndexBegin, - FnIndexEnd, true, Trampolines); - -#ifndef XRAY_NO_PREINIT - if (ObjId >= 0 && flags()->patch_premain) - __xray_patch_object(ObjId); -#endif - - return ObjId; -} - -// Deregisters a DSO from the main XRay runtime. -// Called from the DSO-local runtime when the library is unloaded (e.g. if -// dlclose is called). -// Returns true if the object ID is valid and the DSO was successfully -// deregistered. -SANITIZER_INTERFACE_ATTRIBUTE bool -__xray_deregister_dso(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - - if (!atomic_load(&XRayInitialized, memory_order_acquire)) { - if (Verbosity()) - Report("XRay has not been initialized. Cannot deregister DSO.\n"); - return false; - } - - if (ObjId <= 0 || ObjId >= __xray_num_objects()) { - if (Verbosity()) - Report("Can't deregister object with ID %d: ID is invalid.\n", ObjId); - return false; - } - - { - SpinMutexLock Guard(&XRayInstrMapMutex); - auto &Entry = XRayInstrMaps[ObjId]; - if (!Entry.FromDSO) { - if (Verbosity()) - Report("Can't deregister object with ID %d: object does not correspond " - "to a shared library.\n", - ObjId); - return false; - } - if (!Entry.Loaded) { - if (Verbosity()) - Report("Can't deregister object with ID %d: object is not loaded.\n", - ObjId); - return true; - } - // Mark DSO as unloaded. No need to unpatch. - Entry.Loaded = false; - } - - if (Verbosity()) - Report("Deregistered object with ID %d.\n", ObjId); - - return true; -} - // FIXME: Make check-xray tests work on FreeBSD without // SANITIZER_CAN_USE_PREINIT_ARRAY. // See sanitizer_internal_defs.h where the macro is defined. 
diff --git a/compiler-rt/lib/xray/xray_interface.cpp b/compiler-rt/lib/xray/xray_interface.cpp index 402fc3d07b4e..5839043fcb93 100644 --- a/compiler-rt/lib/xray/xray_interface.cpp +++ b/compiler-rt/lib/xray/xray_interface.cpp @@ -36,8 +36,7 @@ extern __sanitizer::SpinMutex XRayInstrMapMutex; extern __sanitizer::atomic_uint8_t XRayInitialized; -extern __xray::XRaySledMap *XRayInstrMaps; -extern __sanitizer::atomic_uint32_t XRayNumObjects; +extern __xray::XRaySledMap XRayInstrMap; namespace __xray { @@ -62,16 +61,16 @@ static const int16_t cSledLength = 20; #endif /* CPU architecture */ // This is the function to call when we encounter the entry or exit sleds. -atomic_uintptr_t XRayPatchedFunction SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedFunction{0}; // This is the function to call from the arg1-enabled sleds/trampolines. -atomic_uintptr_t XRayArgLogger SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayArgLogger{0}; // This is the function to call when we encounter a custom event log call. -atomic_uintptr_t XRayPatchedCustomEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedCustomEvent{0}; // This is the function to call when we encounter a typed event log call. -atomic_uintptr_t XRayPatchedTypedEvent SANITIZER_INTERFACE_ATTRIBUTE{0}; +atomic_uintptr_t XRayPatchedTypedEvent{0}; // This is the global status to determine whether we are currently // patching/unpatching. 
@@ -151,42 +150,27 @@ public: namespace { -bool isObjectLoaded(int32_t ObjId) { - SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - return false; - } - return XRayInstrMaps[ObjId].Loaded; -} - -bool patchSled(const XRaySledEntry &Sled, bool Enable, int32_t FuncId, - const XRayTrampolines &Trampolines) XRAY_NEVER_INSTRUMENT { +bool patchSled(const XRaySledEntry &Sled, bool Enable, + int32_t FuncId) XRAY_NEVER_INSTRUMENT { bool Success = false; switch (Sled.Kind) { case XRayEntryType::ENTRY: - Success = - patchFunctionEntry(Enable, FuncId, Sled, Trampolines.EntryTrampoline); + Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry); break; case XRayEntryType::EXIT: - Success = - patchFunctionExit(Enable, FuncId, Sled, Trampolines.ExitTrampoline); + Success = patchFunctionExit(Enable, FuncId, Sled); break; case XRayEntryType::TAIL: - Success = patchFunctionTailExit(Enable, FuncId, Sled, - Trampolines.TailExitTrampoline); + Success = patchFunctionTailExit(Enable, FuncId, Sled); break; case XRayEntryType::LOG_ARGS_ENTRY: - Success = - patchFunctionEntry(Enable, FuncId, Sled, Trampolines.LogArgsTrampoline); + Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry); break; case XRayEntryType::CUSTOM_EVENT: - Success = patchCustomEvent(Enable, FuncId, Sled, - Trampolines.CustomEventTrampoline); + Success = patchCustomEvent(Enable, FuncId, Sled); break; case XRayEntryType::TYPED_EVENT: - Success = - patchTypedEvent(Enable, FuncId, Sled, Trampolines.TypedEventTrampoline); + Success = patchTypedEvent(Enable, FuncId, Sled); break; default: Report("Unsupported sled kind '%" PRIu64 "' @%04x\n", Sled.Address, @@ -221,9 +205,10 @@ findFunctionSleds(int32_t FuncId, return Index; } -XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, +XRayPatchingStatus patchFunction(int32_t FuncId, bool Enable) XRAY_NEVER_INSTRUMENT { - if (!atomic_load(&XRayInitialized, 
memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. uint8_t NotPatching = false; @@ -235,24 +220,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch function: invalid sled map index: %d", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } // If we don't have an index, we can't patch individual functions. if (InstrMap.Functions == 0) return XRayPatchingStatus::NOT_INITIALIZED; - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Invalid function id provided: %d\n", FuncId); - return XRayPatchingStatus::NOT_INITIALIZED; - } - // FuncId must be a positive number, less than the number of functions // instrumented. if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) { @@ -260,8 +234,6 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, return XRayPatchingStatus::FAILED; } - auto PackedId = __xray::MakePackedId(FuncId, ObjId); - // Now we patch ths sleds for this specific function. 
XRayFunctionSledIndex SledRange; if (InstrMap.SledsIndex) { @@ -270,13 +242,13 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, } else { SledRange = findFunctionSleds(FuncId, InstrMap); } - auto *f = SledRange.Begin; bool SucceedOnce = false; for (size_t i = 0; i != SledRange.Size; ++i) - SucceedOnce |= patchSled(f[i], Enable, PackedId, InstrMap.Trampolines); + SucceedOnce |= patchSled(f[i], Enable, FuncId); - atomic_store(&XRayPatching, false, memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); if (!SucceedOnce) { Report("Failed patching any sled for function '%d'.", FuncId); @@ -289,31 +261,32 @@ XRayPatchingStatus patchFunction(int32_t FuncId, int32_t ObjId, // controlPatching implements the common internals of the patching/unpatching // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. -// This function should only be called after ensuring that XRay is initialized -// and no other thread is currently patching. -XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { +XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!atomic_compare_exchange_strong( + &XRayPatching, &NotPatching, true, memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. 
+ + uint8_t PatchingSuccess = false; + auto XRayPatchingStatusResetter = + at_scope_exit([&PatchingSuccess] { + if (!PatchingSuccess) + atomic_store(&XRayPatching, false, + memory_order_release); + }); + XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch functions: invalid sled map index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } if (InstrMap.Entries == 0) return XRayPatchingStatus::NOT_INITIALIZED; - if (Verbosity()) - Report("Patching object %d with %d functions.\n", ObjId, InstrMap.Entries); - - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Object is not loaded at index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - uint32_t FuncId = 1; uint64_t CurFun = 0; @@ -363,96 +336,20 @@ XRayPatchingStatus controlPatchingObjectUnchecked(bool Enable, int32_t ObjId) { ++FuncId; CurFun = F; } - auto PackedId = __xray::MakePackedId(FuncId, ObjId); - patchSled(Sled, Enable, PackedId, InstrMap.Trampolines); + patchSled(Sled, Enable, FuncId); } - atomic_store(&XRayPatching, false, memory_order_release); + atomic_store(&XRayPatching, false, + memory_order_release); + PatchingSuccess = true; return XRayPatchingStatus::SUCCESS; } -// Controls patching for all registered objects. -// Returns: SUCCESS, if patching succeeds for all objects. -// NOT_INITIALIZED, if one or more objects returned NOT_INITIALIZED -// but none failed. -// FAILED, if patching of one or more objects failed. -XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { - if (!atomic_load(&XRayInitialized, memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. 
- - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, - memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. - - auto XRayPatchingStatusResetter = at_scope_exit( - [] { atomic_store(&XRayPatching, false, memory_order_release); }); - - unsigned NumObjects = __xray_num_objects(); - - XRayPatchingStatus CombinedStatus{NOT_INITIALIZED}; - for (unsigned I = 0; I < NumObjects; ++I) { - if (!isObjectLoaded(I)) - continue; - auto LastStatus = controlPatchingObjectUnchecked(Enable, I); - switch (LastStatus) { - case SUCCESS: - if (CombinedStatus == NOT_INITIALIZED) - CombinedStatus = SUCCESS; - break; - case FAILED: - // Report failure, but try to patch the remaining objects - CombinedStatus = FAILED; - break; - case NOT_INITIALIZED: - // XRay has been initialized but there are no sleds available for this - // object. Try to patch remaining objects. - if (CombinedStatus != FAILED) - CombinedStatus = NOT_INITIALIZED; - break; - case ONGOING: - UNREACHABLE("Status ONGOING should not appear at this point"); - default: - UNREACHABLE("Unhandled patching status"); - } - } - return CombinedStatus; -} - -// Controls patching for one object. -XRayPatchingStatus controlPatching(bool Enable, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - - if (!atomic_load(&XRayInitialized, memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. - - uint8_t NotPatching = false; - if (!atomic_compare_exchange_strong(&XRayPatching, &NotPatching, true, - memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. 
- - auto XRayPatchingStatusResetter = at_scope_exit( - [] { atomic_store(&XRayPatching, false, memory_order_release); }); - - return controlPatchingObjectUnchecked(Enable, ObjId); -} - -XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, +XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, bool Enable) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || - ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) { - Report("Unable to patch function: invalid sled map index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; - } - InstrMap = XRayInstrMaps[ObjId]; - } - - // Check if the corresponding DSO has been unloaded. - if (!InstrMap.Loaded) { - Report("Object is not loaded at index: %d\n", ObjId); - return XRayPatchingStatus::FAILED; + InstrMap = XRayInstrMap; } // FuncId must be a positive number, less than the number of functions @@ -501,7 +398,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, int32_t ObjId, Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; } - return patchFunction(FuncId, ObjId, Enable); + return patchFunction(FuncId, Enable); } } // namespace @@ -515,10 +412,12 @@ using namespace __xray; int __xray_set_handler(void (*entry)(int32_t, XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast(entry), memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -526,9 +425,11 @@ int __xray_set_handler(void (*entry)(int32_t, int __xray_set_customevent_handler(void (*entry)(void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedCustomEvent, - reinterpret_cast(entry), 
memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -536,9 +437,11 @@ int __xray_set_customevent_handler(void (*entry)(void *, size_t)) int __xray_set_typedevent_handler(void (*entry)(size_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT { - if (atomic_load(&XRayInitialized, memory_order_acquire)) { + if (atomic_load(&XRayInitialized, + memory_order_acquire)) { atomic_store(&__xray::XRayPatchedTypedEvent, - reinterpret_cast(entry), memory_order_release); + reinterpret_cast(entry), + memory_order_release); return 1; } return 0; @@ -571,78 +474,39 @@ XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { return controlPatching(true); } -XRayPatchingStatus __xray_patch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return controlPatching(true, ObjId); -} - XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { return controlPatching(false); } -XRayPatchingStatus __xray_unpatch_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return controlPatching(false, ObjId); -} - XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(FuncId); - auto ObjId = Ids.first; - auto FnId = Ids.second; - return mprotectAndPatchFunction(FnId, ObjId, true); -} - -XRayPatchingStatus -__xray_patch_function_in_object(int32_t FuncId, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, ObjId, true); + return mprotectAndPatchFunction(FuncId, true); } XRayPatchingStatus __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(FuncId); - auto ObjId = Ids.first; - auto FnId = Ids.second; - return mprotectAndPatchFunction(FnId, ObjId, false); -} - -XRayPatchingStatus -__xray_unpatch_function_in_object(int32_t FuncId, - int32_t ObjId) XRAY_NEVER_INSTRUMENT { - return mprotectAndPatchFunction(FuncId, ObjId, false); + return mprotectAndPatchFunction(FuncId, false); } int __xray_set_handler_arg1(void (*entry)(int32_t, 
XRayEntryType, uint64_t)) { - if (!atomic_load(&XRayInitialized, memory_order_acquire)) + if (!atomic_load(&XRayInitialized, + memory_order_acquire)) return 0; // A relaxed write might not be visible even if the current thread gets // scheduled on a different CPU/NUMA node. We need to wait for everyone to // have this handler installed for consistency of collected data across CPUs. atomic_store(&XRayArgLogger, reinterpret_cast(entry), - memory_order_release); + memory_order_release); return 1; } int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } -uintptr_t -__xray_function_address(int32_t CombinedFuncId) XRAY_NEVER_INSTRUMENT { - auto Ids = __xray::UnpackId(CombinedFuncId); - return __xray_function_address_in_object(Ids.second, Ids.first); -} - -uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) - XRAY_NEVER_INSTRUMENT { +uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { SpinMutexLock Guard(&XRayInstrMapMutex); - auto count = atomic_load(&XRayNumObjects, memory_order_acquire); - if (ObjId < 0 || ObjId >= count) { - Report("Unable to determine function address: invalid sled map index %d " - "(size is %d)\n", - ObjId, (int)count); - return 0; - } - InstrMap = XRayInstrMaps[ObjId]; + InstrMap = XRayInstrMap; } if (FuncId <= 0 || static_cast(FuncId) > InstrMap.Functions) @@ -661,29 +525,6 @@ uintptr_t __xray_function_address_in_object(int32_t FuncId, int32_t ObjId) } size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { - return __xray_max_function_id_in_object(0); -} - -size_t __xray_max_function_id_in_object(int32_t ObjId) XRAY_NEVER_INSTRUMENT { - SpinMutexLock Guard(&XRayInstrMapMutex); - if (ObjId < 0 || ObjId >= atomic_load(&XRayNumObjects, memory_order_acquire)) - return 0; - return XRayInstrMaps[ObjId].Functions; -} - -size_t __xray_num_objects() XRAY_NEVER_INSTRUMENT { SpinMutexLock Guard(&XRayInstrMapMutex); - return atomic_load(&XRayNumObjects, 
memory_order_acquire); -} - -int32_t __xray_unpack_function_id(int32_t PackedId) { - return __xray::UnpackId(PackedId).second; -} - -int32_t __xray_unpack_object_id(int32_t PackedId) { - return __xray::UnpackId(PackedId).first; -} - -int32_t __xray_pack_id(int32_t FuncId, int32_t ObjId) { - return __xray::MakePackedId(FuncId, ObjId); + return XRayInstrMap.Functions; } diff --git a/compiler-rt/lib/xray/xray_interface_internal.h b/compiler-rt/lib/xray/xray_interface_internal.h index 5fbaa9c3f315..80c07c167f64 100644 --- a/compiler-rt/lib/xray/xray_interface_internal.h +++ b/compiler-rt/lib/xray/xray_interface_internal.h @@ -18,18 +18,6 @@ #include "xray/xray_interface.h" #include #include -#include - -extern "C" { -// The following functions have to be defined in assembler, on a per-platform -// basis. See xray_trampoline_*.S files for implementations. -extern void __xray_FunctionEntry(); -extern void __xray_FunctionExit(); -extern void __xray_FunctionTailExit(); -extern void __xray_ArgLoggerEntry(); -extern void __xray_CustomEvent(); -extern void __xray_TypedEvent(); -} extern "C" { @@ -79,77 +67,36 @@ struct XRayFunctionSledIndex { uintptr_t(Begin)); } }; - -struct XRayTrampolines { - void (*EntryTrampoline)(); - void (*ExitTrampoline)(); - void (*TailExitTrampoline)(); - void (*LogArgsTrampoline)(); - void (*CustomEventTrampoline)(); - void (*TypedEventTrampoline)(); - - XRayTrampolines() { - // These resolve to the definitions in the respective executable or DSO. 
- EntryTrampoline = __xray_FunctionEntry; - ExitTrampoline = __xray_FunctionExit; - TailExitTrampoline = __xray_FunctionTailExit; - LogArgsTrampoline = __xray_ArgLoggerEntry; - CustomEventTrampoline = __xray_CustomEvent; - TypedEventTrampoline = __xray_TypedEvent; - } -}; - -extern int32_t __xray_register_dso(const XRaySledEntry *SledsBegin, - const XRaySledEntry *SledsEnd, - const XRayFunctionSledIndex *FnIndexBegin, - const XRayFunctionSledIndex *FnIndexEnd, - XRayTrampolines Trampolines); - -extern bool __xray_deregister_dso(int32_t ObjId); } namespace __xray { -constexpr uint32_t XRayNFnBits = 24; -constexpr uint32_t XRayNObjBits = 8; - -constexpr uint32_t XRayFnBitMask = 0x00FFFFFF; -constexpr uint32_t XRayObjBitMask = 0xFF000000; - -constexpr size_t XRayMaxFunctions = 1 << XRayNFnBits; -constexpr size_t XRayMaxObjects = 1 << XRayNObjBits; - -inline int32_t MakePackedId(int32_t FnId, int32_t ObjId) { - return ((ObjId << XRayNFnBits) & XRayObjBitMask) | (FnId & XRayFnBitMask); -} - -inline std::pair UnpackId(int32_t PackedId) { - uint32_t ObjId = (PackedId & XRayObjBitMask) >> XRayNFnBits; - uint32_t FnId = PackedId & XRayFnBitMask; - return {ObjId, FnId}; -} - struct XRaySledMap { const XRaySledEntry *Sleds; size_t Entries; const XRayFunctionSledIndex *SledsIndex; size_t Functions; - XRayTrampolines Trampolines; - bool FromDSO; - bool Loaded; }; bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); +bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); bool patchFunctionTailExit(bool Enable, uint32_t FuncId, - const XRaySledEntry &Sled, void (*Trampoline)()); -bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); -bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, - void (*Trampoline)()); + const 
XRaySledEntry &Sled); +bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); } // namespace __xray +extern "C" { +// The following functions have to be defined in assembler, on a per-platform +// basis. See xray_trampoline_*.S files for implementations. +extern void __xray_FunctionEntry(); +extern void __xray_FunctionExit(); +extern void __xray_FunctionTailExit(); +extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); +extern void __xray_TypedEvent(); +} + #endif diff --git a/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/compiler-rt/lib/xray/xray_trampoline_x86_64.S index 0f480547b52c..01098f60eeab 100644 --- a/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -107,16 +107,6 @@ .section __TEXT,__text #endif -.macro LOAD_HANDLER_ADDR handler -#if !defined(XRAY_PIC) - movq ASM_SYMBOL(\handler)(%rip), %rax -#else - movq ASM_SYMBOL(\handler)@GOTPCREL(%rip), %rax - movq (%rax), %rax -#endif -.endm - - //===----------------------------------------------------------------------===// .globl ASM_SYMBOL(__xray_FunctionEntry) @@ -131,7 +121,7 @@ ASM_SYMBOL(__xray_FunctionEntry): // This load has to be atomic, it's concurrent with __xray_patch(). // On x86/amd64, a simple (type-aligned) MOV instruction is enough. 
- LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax, %rax je LOCAL_LABEL(tmp0) @@ -169,7 +159,7 @@ ASM_SYMBOL(__xray_FunctionExit): movupd %xmm1, 16(%rsp) movq %rax, 8(%rsp) movq %rdx, 0(%rsp) - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(tmp2) @@ -205,7 +195,7 @@ ASM_SYMBOL(__xray_FunctionTailExit): SAVE_REGISTERS ALIGN_STACK_16B - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(tmp4) @@ -234,12 +224,12 @@ ASM_SYMBOL(__xray_ArgLoggerEntry): ALIGN_STACK_16B // Again, these function pointer loads must be atomic; MOV is fine. - LOAD_HANDLER_ADDR _ZN6__xray13XRayArgLoggerE + movq ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)(%rip), %rax testq %rax, %rax jne LOCAL_LABEL(arg1entryLog) // If [arg1 logging handler] not set, defer to no-arg logging. - LOAD_HANDLER_ADDR _ZN6__xray19XRayPatchedFunctionE + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax, %rax je LOCAL_LABEL(arg1entryFail) @@ -278,7 +268,7 @@ ASM_SYMBOL(__xray_CustomEvent): // We take two arguments to this trampoline, which should be in rdi and rsi // already. - LOAD_HANDLER_ADDR _ZN6__xray22XRayPatchedCustomEventE + movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(customEventCleanup) @@ -303,7 +293,7 @@ ASM_SYMBOL(__xray_TypedEvent): // We pass three arguments to this trampoline, which should be in rdi, rsi // and rdx without our intervention. 
- LOAD_HANDLER_ADDR _ZN6__xray21XRayPatchedTypedEventE + movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax testq %rax,%rax je LOCAL_LABEL(typedEventCleanup) diff --git a/compiler-rt/lib/xray/xray_x86_64.cpp b/compiler-rt/lib/xray/xray_x86_64.cpp index 663a51b26866..b9666a40861d 100644 --- a/compiler-rt/lib/xray/xray_x86_64.cpp +++ b/compiler-rt/lib/xray/xray_x86_64.cpp @@ -170,8 +170,7 @@ bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, } bool patchFunctionExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -193,11 +192,11 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, // Prerequisite is to compute the relative offset fo the // __xray_FunctionExit function's address. const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(Trampoline) - + int64_t TrampolineOffset = reinterpret_cast(__xray_FunctionExit) - (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(Trampoline), + reinterpret_cast(__xray_FunctionExit), reinterpret_cast(Address)); return false; } @@ -218,16 +217,16 @@ bool patchFunctionExit(const bool Enable, const uint32_t FuncId, } bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the tail call sled with a similar // sequence as the entry sled, but calls the tail exit sled instead. 
const uint64_t Address = Sled.address(); - int64_t TrampolineOffset = reinterpret_cast(Trampoline) - - (static_cast(Address) + 11); + int64_t TrampolineOffset = + reinterpret_cast(__xray_FunctionTailExit) - + (static_cast(Address) + 11); if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) { Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n", - reinterpret_cast(Trampoline), + reinterpret_cast(__xray_FunctionTailExit), reinterpret_cast(Address)); return false; } @@ -248,8 +247,7 @@ bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, } bool patchCustomEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: @@ -277,8 +275,7 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, } bool patchTypedEvent(const bool Enable, const uint32_t FuncId, - const XRaySledEntry &Sled, - void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { + const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // // xray_sled_n: diff --git a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp deleted file mode 100644 index 31c615bd1f81..000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/basic-mode-dso.cpp +++ /dev/null @@ -1,47 +0,0 @@ -// Testing shared library support in basic logging mode. 
- -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=false,xray_mode=xray-basic,xray_logfile_base=basic-mode-dso-,verbosity=1" XRAY_BASIC_OPTIONS="func_duration_threshold_us=0" %run %t/main.o 2>&1 | FileCheck %s -// RUN: %llvm_xray account --format=csv --sort=funcid "`ls basic-mode-dso-* | head -1`" | FileCheck --check-prefix=ACCOUNT %s -// RUN: rm basic-mode-dso-* - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); - sleep(1); -} - -extern void instrumented_in_dso(); - -int main() { - // Explicit patching to ensure the DSO has been loaded - __xray_patch(); - instrumented_in_executable(); - // CHECK: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called -} - -//--- testlib.cpp - -#include -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} - -// ACCOUNT: funcid,count,min,median,90%ile,99%ile,max,sum,debug,function -// ACCOUNT-NEXT: 1,1,{{.*}} -// ACCOUNT-NEXT: 16777217,1,{{.*}} diff --git a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp b/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp deleted file mode 100644 index 92f3c29e970d..000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/clang-xray-shared.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// Test that the DSO-local runtime library has been linked if -fxray-shared is passed. 
-// -// RUN: %clangxx -fxray-instrument -fxray-shared %s -shared -o %t.so -// RUN: llvm-nm %t.so | FileCheck %s --check-prefix ENABLED - -// RUN: %clangxx -fxray-instrument %s -shared -o %t.so -// RUN: llvm-nm %t.so | FileCheck %s --check-prefix DISABLED -// -// REQUIRES: target=x86_64{{.*}} - -[[clang::xray_always_instrument]] int always_instrumented() { return 42; } - -// ENABLED: __start_xray_instr_map -// DISABLED-NOT: __start_xray_instr_map diff --git a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp b/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp deleted file mode 100644 index 9db411d5ff1c..000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/dlopen.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// Check that we can patch and un-patch DSOs loaded with dlopen. -// - -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp -o %t/main.o -// -// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlib.so 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -typedef void (*dso_func_type)(); - -int main(int argc, char **argv) { - if (argc < 2) { - printf("Shared library argument missing\n"); - // CHECK-NOT: Shared library argument missing - return 1; - } - - const char *dso_path = argv[1]; - - void *dso_handle = dlopen(dso_path, RTLD_LAZY); - if (!dso_handle) { - printf("Failed to load shared library\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - return 1; - } - return 1; - } - - dso_func_type instrumented_in_dso = - 
(dso_func_type)dlsym(dso_handle, "_Z19instrumented_in_dsov"); - if (!instrumented_in_dso) { - printf("Failed to find symbol\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - return 1; - } - return 1; - } - - __xray_set_handler(test_handler); - - instrumented_in_executable(); - // CHECK: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - - auto status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 - - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - - status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - - instrumented_in_executable(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - - dlclose(dso_handle); - - status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} diff --git a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp b/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp deleted file mode 100644 index 89da2764c35c..000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/dso-dep-chains.cpp +++ /dev/null @@ -1,197 +0,0 @@ -// Check that loading libraries with different modes (RTLD_LOCAL/RTLD_GLOBAL) -// and dependencies on other DSOs 
work correctly. -// - -// RUN: split-file %s %t -// -// Build shared libs with dependencies b->c and e->f -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testliba.cpp -o %t/testliba.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibc.cpp -o %t/testlibc.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibb.cpp %t/testlibc.so -o %t/testlibb.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibd.cpp -o %t/testlibd.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibf.cpp -o %t/testlibf.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlibe.cpp %t/testlibf.so -o %t/testlibe.so -// -// Executable links with a and b explicitly and loads d and e at runtime. -// RUN: %clangxx_xray -g -fPIC -rdynamic -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testliba.so %t/testlibb.so -o %t/main.o -// -// RUN: XRAY_OPTIONS="patch_premain=true" %run %t/main.o %t/testlibd.so %t/testlibe.so 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include -#include - -[[clang::xray_never_instrument]] void test_handler(int32_t fid, - XRayEntryType type) { - printf("called: %d, object=%d, fn=%d, type=%d\n", fid, (fid >> 24) & 0xFF, - fid & 0x00FFFFFF, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -typedef void (*dso_func_type)(); - -[[clang::xray_never_instrument]] void *load_dso(const char *path, int mode) { - void *dso_handle = dlopen(path, mode); - if (!dso_handle) { - printf("failed to load shared library\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - } - return nullptr; - } - return dso_handle; -} - 
-[[clang::xray_never_instrument]] void find_and_call(void *dso_handle, - const char *fn) { - dso_func_type dso_fn = (dso_func_type)dlsym(dso_handle, fn); - if (!dso_fn) { - printf("failed to find symbol\n"); - char *error = dlerror(); - if (error) { - fprintf(stderr, "%s\n", error); - } - return; - } - dso_fn(); -} - -extern void a(); -extern void b(); - -int main(int argc, char **argv) { - - if (argc < 3) { - printf("Shared library arguments missing\n"); - // CHECK-NOT: Shared library arguments missing - return 1; - } - - const char *dso_path_d = argv[1]; - const char *dso_path_e = argv[2]; - - __xray_set_handler(test_handler); - - instrumented_in_executable(); - // CHECK: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{[0-9]+}}, object=0, fn={{[0-9]+}}, type=1 - - a(); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: a called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ1]], fn=1, type=1 - - // Make sure this object ID does not appear again - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ1]] - - b(); // b calls c - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: b called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: c called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ3]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ3]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ2]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ2]] - - // Now check explicit loading with RTLD_LOCAL - - void *dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_LOCAL); - void *dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_LOCAL); - // CHECK-NOT: failed to load shared library - - find_and_call(dso_handle_d, "_Z1dv"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ4:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: d called - // CHECK-NEXT: called: 
{{[0-9]+}}, object=[[OBJ4]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ4]] - - find_and_call(dso_handle_e, "_Z1ev"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: e called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: f called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ6]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ6]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ5]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ5]] - - // Unload DSOs - dlclose(dso_handle_d); - dlclose(dso_handle_e); - - // Repeat test with RTLD_GLOBAL - dso_handle_d = load_dso(dso_path_d, RTLD_LAZY | RTLD_GLOBAL); - dso_handle_e = load_dso(dso_path_e, RTLD_LAZY | RTLD_GLOBAL); - // CHECK-NOT: failed to load shared library - - find_and_call(dso_handle_d, "_Z1dv"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: d called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ7]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ7]] - - find_and_call(dso_handle_e, "_Z1ev"); - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: e called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9:[0-9]+]], fn=1, type=0 - // CHECK-NEXT: f called - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ9]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ9]] - // CHECK-NEXT: called: {{[0-9]+}}, object=[[OBJ8]], fn=1, type=1 - // CHECK-NOT: called: {{[0-9]+}}, object=[[OBJ8]] - - auto status = __xray_unpatch(); - printf("unpatching status: %d\n", static_cast(status)); - // CHECK-NEXT: unpatching status: 1 - - dlclose(dso_handle_d); - dlclose(dso_handle_e); -} - -//--- libgenmacro.inc -#include -// Helper macros to quickly generate libraries containing a single function. 
-#define GENERATE_LIB(NAME) \ - [[clang::xray_always_instrument]] void NAME() { printf(#NAME " called\n"); } - -#define GENERATE_LIB_WITH_CALL(NAME, FN) \ - extern void FN(); \ - [[clang::xray_always_instrument]] void NAME() { \ - printf(#NAME " called\n"); \ - FN(); \ - } - -//--- testliba.cpp -#include "libgenmacro.inc" -GENERATE_LIB(a) - -//--- testlibb.cpp -#include "libgenmacro.inc" -GENERATE_LIB_WITH_CALL(b, c) - -//--- testlibc.cpp -#include "libgenmacro.inc" -GENERATE_LIB(c) - -//--- testlibd.cpp -#include "libgenmacro.inc" -GENERATE_LIB(d) - -//--- testlibe.cpp -#include "libgenmacro.inc" -GENERATE_LIB_WITH_CALL(e, f) - -//--- testlibf.cpp -#include "libgenmacro.inc" -GENERATE_LIB(f) diff --git a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp deleted file mode 100644 index 0708d0383439..000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/patch-premain-dso.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Checking that DSOs are automatically patched upon load, if patch_premain is passed. 
- -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=true,verbosity=1" %run %t/main.o 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -extern void instrumented_in_dso(); - -int main() { - __xray_set_handler(test_handler); - instrumented_in_executable(); - // CHECK: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} diff --git a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp b/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp deleted file mode 100644 index d3e992dd4977..000000000000 --- a/compiler-rt/test/xray/TestCases/Posix/patching-unpatching-dso.cpp +++ /dev/null @@ -1,75 +0,0 @@ -// Check that we can patch and un-patch on demand, and that logging gets invoked -// appropriately. 
-// - -// RUN: split-file %s %t -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -shared -std=c++11 %t/testlib.cpp -o %t/testlib.so -// RUN: %clangxx_xray -g -fPIC -fxray-instrument -fxray-shared -std=c++11 %t/main.cpp %t/testlib.so -Wl,-rpath,%t -o %t/main.o - -// RUN: XRAY_OPTIONS="patch_premain=false" %run %t/main.o 2>&1 | FileCheck %s - -// REQUIRES: target=x86_64{{.*}} - -//--- main.cpp - -#include "xray/xray_interface.h" - -#include - -bool called = false; - -void test_handler(int32_t fid, XRayEntryType type) { - printf("called: %d, type=%d\n", fid, static_cast(type)); - called = true; -} - -[[clang::xray_always_instrument]] void instrumented_in_executable() { - printf("instrumented_in_executable called\n"); -} - -extern void instrumented_in_dso(); - -int main() { - __xray_set_handler(test_handler); - instrumented_in_executable(); - // CHECK: instrumented_in_executable called - instrumented_in_dso(); - // CHECK: instrumented_in_dso called - auto status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - instrumented_in_executable(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_executable called - // CHECK-NEXT: called: {{.*}}, type=1 - instrumented_in_dso(); - // CHECK-NEXT: called: {{.*}}, type=0 - // CHECK-NEXT: instrumented_in_dso called - // CHECK-NEXT: called: {{.*}}, type=1 - status = __xray_unpatch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - status = __xray_patch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 - __xray_remove_handler(); - instrumented_in_executable(); - // CHECK-NEXT: instrumented_in_executable called - instrumented_in_dso(); - // CHECK-NEXT: instrumented_in_dso called - status = 
__xray_unpatch(); - printf("patching status: %d\n", static_cast(status)); - // CHECK-NEXT: patching status: 1 -} - -//--- testlib.cpp - -#include - -[[clang::xray_always_instrument]] void instrumented_in_dso() { - printf("instrumented_in_dso called\n"); -} -- GitLab From 14705a912f6296700cef4d2aa7eb100f71dfbd0a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 11 Oct 2024 16:16:12 +0400 Subject: [PATCH 026/345] CodeGen: Remove redundant REQUIRES registered-target from tests (#111982) These are already in target specific test directories. --- llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir | 1 - llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll | 1 - llvm/test/CodeGen/X86/tls-align.ll | 1 - 3 files changed, 3 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir index c1ddc9c14d81..51e9ed6fef2d 100644 --- a/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir +++ b/llvm/test/CodeGen/AArch64/statepoint-twoaddr.mir @@ -1,7 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-unknown-linux -run-pass=twoaddressinstruction -verify-machineinstrs %s -o - | FileCheck %s # RUN: llc -mtriple=aarch64-unknown-linux --passes=two-address-instruction -verify-each %s -o - | FileCheck %s -# REQUIRES: aarch64-registered-target # Verify that the register class is correctly constrained after the twoaddress replacement --- diff --git a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll index d0fd6685df3d..cca70005b4cd 100644 --- a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll +++ b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature ; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s -; REQUIRES: amdgpu-registered-target target datalayout 
= "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" target triple = "amdgcn-amd-amdhsa" diff --git a/llvm/test/CodeGen/X86/tls-align.ll b/llvm/test/CodeGen/X86/tls-align.ll index e996c00dbf1d..94f9b9045cf2 100644 --- a/llvm/test/CodeGen/X86/tls-align.ll +++ b/llvm/test/CodeGen/X86/tls-align.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86-registered-target ; RUN: opt -passes=instcombine -S < %s | FileCheck %s %class.Arr = type <{ [160 x %class.Derived], i32, [4 x i8] }> -- GitLab From 900ea21ffb38ba5b783b20f394c43c6c89d58086 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:25:12 -0700 Subject: [PATCH 027/345] [NFC][CodingStandard] Add additional example for if-else brace rule (#111733) Add example to document that single statement `else` needs a brace if the associated `if` needs a brace. --- llvm/docs/CodingStandards.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/llvm/docs/CodingStandards.rst b/llvm/docs/CodingStandards.rst index 63df5af2523d..87bbb3d127ad 100644 --- a/llvm/docs/CodingStandards.rst +++ b/llvm/docs/CodingStandards.rst @@ -1713,6 +1713,14 @@ would help to avoid running into a "dangling else" situation. handleOtherDecl(D); } + // Use braces for the `else` block to keep it uniform with the `if` block. + if (isa(D)) { + verifyFunctionDecl(D); + handleFunctionDecl(D); + } else { + handleOtherDecl(D); + } + // This should also omit braces. The `for` loop contains only a single // statement, so it shouldn't have braces. The `if` also only contains a // single simple statement (the `for` loop), so it also should omit braces. 
-- GitLab From fa789dffb1e12c2aece0187aeacc48dfb1768340 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:26:03 -0700 Subject: [PATCH 028/345] [NFC] Rename `Intrinsic::getDeclaration` to `getOrInsertDeclaration` (#111752) Rename the function to reflect its correct behavior and to be consistent with `Module::getOrInsertFunction`. This is also in preparation of adding a new `Intrinsic::getDeclaration` that will have behavior similar to `Module::getFunction` (i.e, just lookup, no creation). --- clang/lib/CodeGen/CGBuiltin.cpp | 11 +- clang/lib/CodeGen/CGDecl.cpp | 8 +- clang/lib/CodeGen/CGException.cpp | 4 +- clang/lib/CodeGen/CodeGenFunction.cpp | 4 +- clang/lib/CodeGen/CodeGenModule.cpp | 4 +- clang/lib/CodeGen/Targets/SystemZ.cpp | 4 +- llvm/examples/BrainF/BrainF.cpp | 4 +- llvm/include/llvm-c/Core.h | 4 +- llvm/include/llvm/IR/IntrinsicInst.h | 6 +- llvm/include/llvm/IR/Intrinsics.h | 9 +- llvm/include/llvm/IR/MatrixBuilder.h | 8 +- llvm/lib/AsmParser/LLParser.cpp | 2 +- llvm/lib/CodeGen/ExpandLargeFpConvert.cpp | 2 +- llvm/lib/CodeGen/ExpandMemCmp.cpp | 2 +- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 14 +- llvm/lib/CodeGen/HardwareLoops.cpp | 12 +- llvm/lib/CodeGen/IntrinsicLowering.cpp | 2 +- llvm/lib/CodeGen/SafeStack.cpp | 3 +- llvm/lib/CodeGen/SjLjEHPrepare.cpp | 22 +- llvm/lib/CodeGen/StackProtector.cpp | 5 +- llvm/lib/CodeGen/WasmEHPrepare.cpp | 15 +- llvm/lib/IR/AutoUpgrade.cpp | 318 +++++++++--------- llvm/lib/IR/Core.cpp | 2 +- llvm/lib/IR/DIBuilder.cpp | 8 +- llvm/lib/IR/DebugProgramInstruction.cpp | 8 +- llvm/lib/IR/IRBuilder.cpp | 96 +++--- llvm/lib/IR/IntrinsicInst.cpp | 29 +- llvm/lib/IR/Intrinsics.cpp | 5 +- llvm/lib/IR/Module.cpp | 9 +- llvm/lib/IR/VectorBuilder.cpp | 4 +- .../Target/AArch64/AArch64ISelLowering.cpp | 29 +- .../Target/AArch64/AArch64StackTagging.cpp | 18 +- .../AArch64/AArch64TargetTransformInfo.cpp | 2 +- llvm/lib/Target/AArch64/SMEABIPass.cpp | 14 +- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 24 +- 
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 15 +- .../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 2 +- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 17 +- .../AMDGPU/AMDGPUInstructionSelector.cpp | 4 +- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 13 +- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 8 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 14 +- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 11 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 4 +- .../AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 4 +- .../Target/AMDGPU/SIAnnotateControlFlow.cpp | 16 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 25 +- llvm/lib/Target/ARM/ARMParallelDSP.cpp | 13 +- llvm/lib/Target/ARM/MVETailPredication.cpp | 2 +- .../Target/BPF/BPFAbstractMemberAccess.cpp | 2 +- llvm/lib/Target/BPF/BPFAdjustOpt.cpp | 2 +- .../Target/BPF/BPFPreserveStaticOffset.cpp | 2 +- llvm/lib/Target/DirectX/DXILOpLowering.cpp | 4 +- llvm/lib/Target/Hexagon/HexagonGenExtract.cpp | 2 +- .../Target/Hexagon/HexagonISelLowering.cpp | 4 +- .../Hexagon/HexagonLoopIdiomRecognition.cpp | 3 +- .../Target/Hexagon/HexagonVectorCombine.cpp | 11 +- .../LoongArch/LoongArchISelLowering.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 2 +- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 3 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 8 +- .../Target/PowerPC/PPCLowerMASSVEntries.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 30 +- .../Target/SPIRV/SPIRVPrepareFunctions.cpp | 8 +- llvm/lib/Target/SystemZ/SystemZTDC.cpp | 4 +- .../WebAssemblyLowerEmscriptenEHSjLj.cpp | 2 +- .../WebAssemblyLowerRefTypesIntPtrConv.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 +- .../Target/X86/X86InstCombineIntrinsic.cpp | 6 +- llvm/lib/Target/X86/X86PartialReduction.cpp | 2 +- llvm/lib/Target/X86/X86WinEHState.cpp | 16 +- .../Target/XCore/XCoreLowerThreadLocal.cpp | 4 +- .../AggressiveInstCombine.cpp | 11 +- llvm/lib/Transforms/Coroutines/Coroutines.cpp | 5 +- llvm/lib/Transforms/IPO/CrossDSOCFI.cpp | 3 +- 
.../lib/Transforms/IPO/SampleProfileProbe.cpp | 2 +- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 12 +- .../InstCombine/InstCombineAddSub.cpp | 7 +- .../InstCombine/InstCombineAndOrXor.cpp | 14 +- .../InstCombine/InstCombineCalls.cpp | 29 +- .../InstCombine/InstCombineCasts.cpp | 15 +- .../InstCombine/InstCombineCompares.cpp | 18 +- .../InstCombine/InstCombineSelect.cpp | 19 +- .../InstCombineSimplifyDemanded.cpp | 2 +- .../InstCombine/InstCombineVectorOps.cpp | 8 +- .../InstCombine/InstructionCombining.cpp | 6 +- .../Instrumentation/AddressSanitizer.cpp | 4 +- .../Instrumentation/BoundsChecking.cpp | 2 +- .../Instrumentation/HWAddressSanitizer.cpp | 4 +- llvm/lib/Transforms/Instrumentation/KCFI.cpp | 3 +- .../Instrumentation/MemorySanitizer.cpp | 6 +- .../Instrumentation/PGOInstrumentation.cpp | 23 +- .../Instrumentation/SanitizerCoverage.cpp | 2 +- .../Instrumentation/ThreadSanitizer.cpp | 7 +- .../ObjCARC/ARCRuntimeEntryPoints.h | 2 +- .../Transforms/Scalar/InferAddressSpaces.cpp | 14 +- .../Transforms/Scalar/LoopDataPrefetch.cpp | 2 +- llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 4 +- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 4 +- .../Transforms/Scalar/LowerGuardIntrinsic.cpp | 2 +- .../Scalar/LowerMatrixIntrinsics.cpp | 2 +- .../Transforms/Scalar/MakeGuardsExplicit.cpp | 2 +- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 4 +- .../Scalar/RewriteStatepointsForGC.cpp | 4 +- llvm/lib/Transforms/Scalar/Scalarizer.cpp | 5 +- .../Transforms/Utils/AssumeBundleBuilder.cpp | 3 +- llvm/lib/Transforms/Utils/CloneFunction.cpp | 4 +- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 3 +- .../Utils/EntryExitInstrumenter.cpp | 2 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 7 +- llvm/lib/Transforms/Utils/IntegerDivision.cpp | 4 +- llvm/lib/Transforms/Utils/Local.cpp | 3 +- .../lib/Transforms/Utils/LowerGlobalDtors.cpp | 4 +- .../Transforms/Utils/MemoryTaggingSupport.cpp | 6 +- llvm/lib/Transforms/Utils/PredicateInfo.cpp | 4 +- .../Utils/PromoteMemoryToRegister.cpp | 
2 +- .../Utils/RelLookupTableConverter.cpp | 2 +- .../Utils/ScalarEvolutionExpander.cpp | 4 +- .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 2 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- .../llvm-reduce/deltas/ReduceOpcodes.cpp | 2 +- .../Analysis/AssumeBundleQueriesTest.cpp | 3 +- llvm/unittests/Analysis/MemorySSATest.cpp | 2 +- llvm/unittests/Analysis/ValueTrackingTest.cpp | 4 +- llvm/unittests/IR/BasicBlockTest.cpp | 8 +- llvm/unittests/IR/DebugInfoTest.cpp | 3 +- llvm/unittests/IR/IRBuilderTest.cpp | 5 +- llvm/unittests/IR/IntrinsicsTest.cpp | 2 +- llvm/unittests/IR/PatternMatch.cpp | 2 +- llvm/unittests/IR/VPIntrinsicTest.cpp | 4 +- .../Transforms/Vectorize/VPlanTest.cpp | 3 +- .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 6 +- .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 5 +- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 7 +- polly/lib/CodeGen/IslExprBuilder.cpp | 12 +- polly/lib/CodeGen/PerfMonitor.cpp | 2 +- 137 files changed, 721 insertions(+), 642 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ff678ee04f9c..059c75fae284 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -13648,7 +13648,7 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, Value *InfoKind = ConstantInt::get(Int64Ty, C->getSExtValue()); // Built the IR for the preserve_field_info intrinsic. 
- llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getDeclaration( + llvm::Function *FnGetFieldInfo = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_field_info, {FieldAddr->getType()}); return Builder.CreateCall(FnGetFieldInfo, {FieldAddr, InfoKind}); @@ -13670,10 +13670,10 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, llvm::Function *FnDecl; if (BuiltinID == BPF::BI__builtin_btf_type_id) - FnDecl = llvm::Intrinsic::getDeclaration( + FnDecl = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_btf_type_id, {}); else - FnDecl = llvm::Intrinsic::getDeclaration( + FnDecl = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_type_info, {}); CallInst *Fn = Builder.CreateCall(FnDecl, {SeqNumVal, FlagValue}); Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo); @@ -13708,7 +13708,7 @@ Value *CodeGenFunction::EmitBPFBuiltinExpr(unsigned BuiltinID, Value *FlagValue = ConstantInt::get(Int64Ty, Flag->getSExtValue()); Value *SeqNumVal = ConstantInt::get(Int32Ty, BuiltinSeqNum++); - llvm::Function *IntrinsicFn = llvm::Intrinsic::getDeclaration( + llvm::Function *IntrinsicFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::bpf_preserve_enum_value, {}); CallInst *Fn = Builder.CreateCall(IntrinsicFn, {SeqNumVal, EnumStrVal, FlagValue}); @@ -18895,7 +18895,8 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { } case Builtin::BI__builtin_hlsl_wave_is_first_lane: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic(); - return EmitRuntimeCall(Intrinsic::getDeclaration(&CGM.getModule(), ID)); + return EmitRuntimeCall( + Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); } case Builtin::BI__builtin_hlsl_elementwise_sign: { auto *Arg0 = E->getArg(0); diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 563f728e29d7..30af9268b30e 100644 --- 
a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -2509,8 +2509,8 @@ void CodeGenFunction::pushRegularPartialArrayCleanup(llvm::Value *arrayBegin, llvm::Function *CodeGenModule::getLLVMLifetimeStartFn() { if (LifetimeStartFn) return LifetimeStartFn; - LifetimeStartFn = llvm::Intrinsic::getDeclaration(&getModule(), - llvm::Intrinsic::lifetime_start, AllocaInt8PtrTy); + LifetimeStartFn = llvm::Intrinsic::getOrInsertDeclaration( + &getModule(), llvm::Intrinsic::lifetime_start, AllocaInt8PtrTy); return LifetimeStartFn; } @@ -2518,8 +2518,8 @@ llvm::Function *CodeGenModule::getLLVMLifetimeStartFn() { llvm::Function *CodeGenModule::getLLVMLifetimeEndFn() { if (LifetimeEndFn) return LifetimeEndFn; - LifetimeEndFn = llvm::Intrinsic::getDeclaration(&getModule(), - llvm::Intrinsic::lifetime_end, AllocaInt8PtrTy); + LifetimeEndFn = llvm::Intrinsic::getOrInsertDeclaration( + &getModule(), llvm::Intrinsic::lifetime_end, AllocaInt8PtrTy); return LifetimeEndFn; } diff --git a/clang/lib/CodeGen/CGException.cpp b/clang/lib/CodeGen/CGException.cpp index bb2ed237ee9f..44a45413dbc4 100644 --- a/clang/lib/CodeGen/CGException.cpp +++ b/clang/lib/CodeGen/CGException.cpp @@ -1843,7 +1843,7 @@ Address CodeGenFunction::recoverAddrOfEscapedLocal(CodeGenFunction &ParentCGF, std::make_pair(ParentAlloca, ParentCGF.EscapedLocals.size())); int FrameEscapeIdx = InsertPair.first->second; // call ptr @llvm.localrecover(ptr @parentFn, ptr %fp, i32 N) - llvm::Function *FrameRecoverFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameRecoverFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localrecover); RecoverCall = Builder.CreateCall( FrameRecoverFn, {ParentCGF.CurFn, ParentFP, @@ -1942,7 +1942,7 @@ void CodeGenFunction::EmitCapturedLocals(CodeGenFunction &ParentCGF, // %1 = call ptr @llvm.localrecover(@"?fin$0@0@main@@",..) 
// %2 = load ptr, ptr %1, align 8 // ==> %2 is the frame-pointer of outermost host function - llvm::Function *FrameRecoverFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameRecoverFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localrecover); ParentFP = Builder.CreateCall( FrameRecoverFn, {ParentCGF.CurFn, ParentFP, diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index e1fd9b72b8d7..f3023c7a20c4 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -463,7 +463,7 @@ void CodeGenFunction::FinishFunction(SourceLocation EndLoc) { EscapeArgs.resize(EscapedLocals.size()); for (auto &Pair : EscapedLocals) EscapeArgs[Pair.second] = Pair.first; - llvm::Function *FrameEscapeFn = llvm::Intrinsic::getDeclaration( + llvm::Function *FrameEscapeFn = llvm::Intrinsic::getOrInsertDeclaration( &CGM.getModule(), llvm::Intrinsic::localescape); CGBuilderTy(*this, AllocaInsertPt).CreateCall(FrameEscapeFn, EscapeArgs); } @@ -3130,7 +3130,7 @@ void CodeGenFunction::emitAlignmentAssumptionCheck( llvm::Instruction *Assumption) { assert(isa_and_nonnull(Assumption) && cast(Assumption)->getCalledOperand() == - llvm::Intrinsic::getDeclaration( + llvm::Intrinsic::getOrInsertDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), llvm::Intrinsic::assume) && "Assumption should be a call to llvm.assume()."); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 5ba098144a74..7a7dea4668ad 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -6218,8 +6218,8 @@ void CodeGenModule::emitIFuncDefinition(GlobalDecl GD) { llvm::Function *CodeGenModule::getIntrinsic(unsigned IID, ArrayRef Tys) { - return llvm::Intrinsic::getDeclaration(&getModule(), (llvm::Intrinsic::ID)IID, - Tys); + return llvm::Intrinsic::getOrInsertDeclaration(&getModule(), + (llvm::Intrinsic::ID)IID, Tys); } static 
llvm::StringMapEntry & diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp index 56129622f48d..23c96fa5cf98 100644 --- a/clang/lib/CodeGen/Targets/SystemZ.cpp +++ b/clang/lib/CodeGen/Targets/SystemZ.cpp @@ -110,8 +110,8 @@ public: if (Ty->isFloatTy() || Ty->isDoubleTy() || Ty->isFP128Ty()) { llvm::Module &M = CGM.getModule(); auto &Ctx = M.getContext(); - llvm::Function *TDCFunc = - llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::s390_tdc, Ty); + llvm::Function *TDCFunc = llvm::Intrinsic::getOrInsertDeclaration( + &M, llvm::Intrinsic::s390_tdc, Ty); unsigned TDCBits = 0; switch (BuiltinID) { case Builtin::BI__builtin_isnan: diff --git a/llvm/examples/BrainF/BrainF.cpp b/llvm/examples/BrainF/BrainF.cpp index ac01961735e1..e62cc7bd591a 100644 --- a/llvm/examples/BrainF/BrainF.cpp +++ b/llvm/examples/BrainF/BrainF.cpp @@ -67,8 +67,8 @@ void BrainF::header(LLVMContext& C) { //declare void @llvm.memset.p0i8.i32(i8 *, i8, i32, i1) Type *Tys[] = {PointerType::getUnqual(C), Type::getInt32Ty(C)}; - Function *memset_func = Intrinsic::getDeclaration(module, Intrinsic::memset, - Tys); + Function *memset_func = + Intrinsic::getOrInsertDeclaration(module, Intrinsic::memset, Tys); //declare i32 @getchar() getchar_func = diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h index 28dc270ca368..55649d89a6b8 100644 --- a/llvm/include/llvm-c/Core.h +++ b/llvm/include/llvm-c/Core.h @@ -2807,10 +2807,10 @@ unsigned LLVMLookupIntrinsicID(const char *Name, size_t NameLen); unsigned LLVMGetIntrinsicID(LLVMValueRef Fn); /** - * Create or insert the declaration of an intrinsic. For overloaded intrinsics, + * Get or insert the declaration of an intrinsic. For overloaded intrinsics, * parameter types must be provided to uniquely identify an overload. 
* - * @see llvm::Intrinsic::getDeclaration() + * @see llvm::Intrinsic::getOrInsertDeclaration() */ LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod, unsigned ID, diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index 4458126ffa75..920eed01374c 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -568,9 +568,9 @@ public: /// \brief Declares a llvm.vp.* intrinsic in \p M that matches the parameters /// \p Params. Additionally, the load and gather intrinsics require /// \p ReturnType to be specified. - static Function *getDeclarationForParams(Module *M, Intrinsic::ID, - Type *ReturnType, - ArrayRef Params); + static Function *getOrInsertDeclarationForParams(Module *M, Intrinsic::ID, + Type *ReturnType, + ArrayRef Params); static std::optional getMaskParamPos(Intrinsic::ID IntrinsicID); static std::optional getVectorLengthParamPos( diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index b251036247c5..8c37925732a8 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -87,14 +87,15 @@ namespace Intrinsic { /// Return the attributes for an intrinsic. AttributeList getAttributes(LLVMContext &C, ID id); - /// Create or insert an LLVM Function declaration for an intrinsic, and return - /// it. + /// Look up the Function declaration of the intrinsic \p id in the Module + /// \p M. If it does not exist, add a declaration and return it. Otherwise, + /// return the existing declaration. /// - /// The Tys parameter is for intrinsics with overloaded types (e.g., those + /// The \p Tys parameter is for intrinsics with overloaded types (e.g., those /// using iAny, fAny, vAny, or iPTRAny). For a declaration of an overloaded /// intrinsic, Tys must provide exactly one type for each overloaded type in /// the intrinsic. 
- Function *getDeclaration(Module *M, ID id, ArrayRef Tys = {}); + Function *getOrInsertDeclaration(Module *M, ID id, ArrayRef Tys = {}); /// Looks up Name in NameTable via binary search. NameTable must be sorted /// and all entries must start with "llvm.". If NameTable contains an exact diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h index dbf2cfb7c5e9..3a04ca87f2b5 100644 --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -72,7 +72,7 @@ public: B.getInt32(Columns)}; Type *OverloadedTypes[] = {RetType, Stride->getType()}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_column_major_load, OverloadedTypes); CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -95,7 +95,7 @@ public: B.getInt32(Rows), B.getInt32(Columns)}; Type *OverloadedTypes[] = {Matrix->getType(), Stride->getType()}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_column_major_store, OverloadedTypes); CallInst *Call = B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -115,7 +115,7 @@ public: Type *OverloadedTypes[] = {ReturnType}; Value *Ops[] = {Matrix, B.getInt32(Rows), B.getInt32(Columns)}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_transpose, OverloadedTypes); return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); @@ -136,7 +136,7 @@ public: B.getInt32(RHSColumns)}; Type *OverloadedTypes[] = {ReturnType, LHSType, RHSType}; - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( getModule(), Intrinsic::matrix_multiply, OverloadedTypes); return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); } diff --git a/llvm/lib/AsmParser/LLParser.cpp 
b/llvm/lib/AsmParser/LLParser.cpp index c3b4a8235ce6..5b9bddeb7cfe 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -360,7 +360,7 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { OverloadTys)) return error(Info.second, "invalid intrinsic signature"); - U.set(Intrinsic::getDeclaration(M, IID, OverloadTys)); + U.set(Intrinsic::getOrInsertDeclaration(M, IID, OverloadTys)); } Info.first->eraseFromParent(); diff --git a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp index 11f123aa5bed..0a3d0cf8ec93 100644 --- a/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp +++ b/llvm/lib/CodeGen/ExpandLargeFpConvert.cpp @@ -356,7 +356,7 @@ static void expandIToFP(Instruction *IToFP) { Entry->getTerminator()->eraseFromParent(); Function *CTLZ = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, IntTy); + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, IntTy); ConstantInt *True = Builder.getTrue(); // entry: diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index 04222d5b4afd..6d626de0b4e6 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -355,7 +355,7 @@ MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType, // Swap bytes if required. 
if (BSwapSizeType) { - Function *Bswap = Intrinsic::getDeclaration( + Function *Bswap = Intrinsic::getOrInsertDeclaration( CI->getModule(), Intrinsic::bswap, BSwapSizeType); Lhs = Builder.CreateCall(Bswap, Lhs); Rhs = Builder.CreateCall(Bswap, Rhs); diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index ffe879ff0496..32ba3e91822d 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -237,7 +237,7 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder, if (ElemCount.isScalable()) { auto *M = Builder.GetInsertBlock()->getModule(); Type *BoolVecTy = VectorType::get(Builder.getInt1Ty(), ElemCount); - Function *ActiveMaskFunc = Intrinsic::getDeclaration( + Function *ActiveMaskFunc = Intrinsic::getOrInsertDeclaration( M, Intrinsic::get_active_lane_mask, {BoolVecTy, EVLParam->getType()}); // `get_active_lane_mask` performs an implicit less-than comparison. Value *ConstZero = Builder.getInt32(0); @@ -299,7 +299,7 @@ Value *CachingVPExpander::expandPredicationToIntCall( case Intrinsic::umin: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -308,7 +308,7 @@ Value *CachingVPExpander::expandPredicationToIntCall( case Intrinsic::bswap: case Intrinsic::bitreverse: { Value *Op = VPI.getOperand(0); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -327,7 +327,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::fabs: case Intrinsic::sqrt: { Value *Op0 = VPI.getOperand(0); 
- Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -337,7 +337,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::minnum: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); replaceOperation(*NewOp, VPI); @@ -350,7 +350,7 @@ Value *CachingVPExpander::expandPredicationToFPCall( Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); Value *Op2 = VPI.getOperand(2); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); Value *NewOp; if (Intrinsic::isConstrainedFPIntrinsic(UnpredicatedIntrinsicID)) @@ -594,7 +594,7 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { // TODO add caching auto *M = VPI.getModule(); Function *VScaleFunc = - Intrinsic::getDeclaration(M, Intrinsic::vscale, Int32Ty); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, Int32Ty); IRBuilder<> Builder(VPI.getParent(), VPI.getIterator()); Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue()); Value *VScale = Builder.CreateCall(VScaleFunc, {}, "vscale"); diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp index 9205eabcf568..c8a63304a3b6 100644 --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -512,7 +512,7 @@ Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { : Intrinsic::test_set_loop_iterations) : (UsePhi ? 
Intrinsic::start_loop_iterations : Intrinsic::set_loop_iterations); - Function *LoopIter = Intrinsic::getDeclaration(M, ID, Ty); + Function *LoopIter = Intrinsic::getOrInsertDeclaration(M, ID, Ty); Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit); // Use the return value of the intrinsic to control the entry of the loop. @@ -541,9 +541,8 @@ void HardwareLoop::InsertLoopDec() { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = - Intrinsic::getDeclaration(M, Intrinsic::loop_decrement, - LoopDecrement->getType()); + Function *DecFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::loop_decrement, LoopDecrement->getType()); Value *Ops[] = { LoopDecrement }; Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops); Value *OldCond = ExitBranch->getCondition(); @@ -566,9 +565,8 @@ Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = - Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg, - { EltsRem->getType() }); + Function *DecFunc = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::loop_decrement_reg, {EltsRem->getType()}); Value *Ops[] = { EltsRem, LoopDecrement }; Value *Call = CondBuilder.CreateCall(DecFunc, Ops); diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp index 256c081b46e2..f799a8cfc1ba 100644 --- a/llvm/lib/CodeGen/IntrinsicLowering.cpp +++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp @@ -474,7 +474,7 @@ bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) { // Okay, we can do this xform, do so now. 
Module *M = CI->getModule(); - Function *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty); + Function *Int = Intrinsic::getOrInsertDeclaration(M, Intrinsic::bswap, Ty); Value *Op = CI->getArgOperand(0); Op = CallInst::Create(Int, Op, CI->getName(), CI->getIterator()); diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index e41d1bfb0e53..a50909af8bfc 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -368,7 +368,8 @@ Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { if (!StackGuardVar) { TL.insertSSPDeclarations(*M); - return IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard)); + return IRB.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); } return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard"); diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp index 054f7d721596..c4ad9f0b2172 100644 --- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp +++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp @@ -508,17 +508,19 @@ bool SjLjEHPrepareImpl::runOnFunction(Function &F) { PointerType *AllocaPtrTy = M.getDataLayout().getAllocaPtrType(M.getContext()); - FrameAddrFn = - Intrinsic::getDeclaration(&M, Intrinsic::frameaddress, {AllocaPtrTy}); - StackAddrFn = - Intrinsic::getDeclaration(&M, Intrinsic::stacksave, {AllocaPtrTy}); - StackRestoreFn = - Intrinsic::getDeclaration(&M, Intrinsic::stackrestore, {AllocaPtrTy}); + FrameAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::frameaddress, + {AllocaPtrTy}); + StackAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::stacksave, + {AllocaPtrTy}); + StackRestoreFn = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::stackrestore, {AllocaPtrTy}); BuiltinSetupDispatchFn = - Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch); - LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); - CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite); - 
FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch); + LSDAAddrFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_lsda); + CallSiteFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_callsite); + FuncCtxFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); bool Res = setupEntryBlockAndCallSites(F); return Res; diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index 1f23838b2de0..a192161bbd94 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -519,7 +519,8 @@ static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M, if (SupportsSelectionDAGSP) *SupportsSelectionDAGSP = true; TLI->insertSSPDeclarations(*M); - return B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard)); + return B.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); } /// Insert code into the entry block that stores the stack guard @@ -540,7 +541,7 @@ static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc, AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot"); Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP); - B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackprotector), + B.CreateCall(Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackprotector), {GuardSlot, AI}); return SupportsSelectionDAGSP; } diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp index 7514d49fb18a..1701b0d04425 100644 --- a/llvm/lib/CodeGen/WasmEHPrepare.cpp +++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp @@ -196,7 +196,7 @@ bool WasmEHPrepareImpl::prepareThrows(Function &F) { bool Changed = false; // wasm.throw() intinsic, which will be lowered to wasm 'throw' instruction. 
- ThrowF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_throw); + ThrowF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_throw); // Insert an unreachable instruction after a call to @llvm.wasm.throw and // delete all following instructions within the BB, and delete all the dead // children of the BB as well. @@ -260,18 +260,21 @@ bool WasmEHPrepareImpl::prepareEHPads(Function &F) { 0, 2, "selector_gep"); // wasm.landingpad.index() intrinsic, which is to specify landingpad index - LPadIndexF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_landingpad_index); + LPadIndexF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_landingpad_index); // wasm.lsda() intrinsic. Returns the address of LSDA table for the current // function. - LSDAF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_lsda); + LSDAF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_lsda); // wasm.get.exception() and wasm.get.ehselector() intrinsics. Calls to these // are generated in clang. - GetExnF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_exception); - GetSelectorF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_get_ehselector); + GetExnF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_get_exception); + GetSelectorF = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_get_ehselector); // wasm.catch() will be lowered down to wasm 'catch' instruction in // instruction selection. 
- CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch); + CatchF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_catch); // _Unwind_CallPersonality() wrapper function, which calls the personality CallPersonalityF = M.getOrInsertFunction("_Unwind_CallPersonality", diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 215bfc8c6cfe..477b77a6dd53 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -65,7 +65,7 @@ static bool upgradePTESTIntrinsic(Function *F, Intrinsic::ID IID, // Yes, it's old, replace it with new version. rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -81,7 +81,7 @@ static bool upgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID, // Move this function aside and map down. rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -94,7 +94,7 @@ static bool upgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -104,7 +104,7 @@ static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -114,7 +114,7 @@ static bool upgradeX86BF16DPIntrinsic(Function *F, Intrinsic::ID IID, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); return true; } @@ -502,8 +502,8 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, return false; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::x86_rdtscp); + NewFn = 
Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::x86_rdtscp); return true; } @@ -609,14 +609,15 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other 'x86.xop.*' } if (Name == "seh.recoverfp") { - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::eh_recoverfp); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::eh_recoverfp); return true; } @@ -630,15 +631,15 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, Function *&NewFn) { if (Name.starts_with("rbit")) { // '(arm|aarch64).rbit'. - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::bitreverse, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::bitreverse, F->arg_begin()->getType()); return true; } if (Name == "thread.pointer") { // '(arm|aarch64).thread.pointer'. - NewFn = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::thread_pointer); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::thread_pointer); return true; } @@ -663,7 +664,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, std::array Tys{ {F->getReturnType(), FixedVectorType::get(Type::getBFloatTy(Ctx), OperandWidth / 16)}}; - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return true; } return false; // No other '(arm|aarch64).neon.bfdot.*'. 
@@ -688,7 +689,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, : (Intrinsic::ID)Intrinsic::aarch64_neon_bfmlalt) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other '(arm|aarch64).neon.bfm*.v16i8'. @@ -712,8 +713,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .StartsWith("vqsubu.", Intrinsic::usub_sat) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } @@ -733,10 +734,10 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, auto fArgs = F->getFunctionType()->params(); Type *Tys[] = {fArgs[0], fArgs[1]}; if (Groups[1].size() == 1) - NewFn = Intrinsic::getDeclaration(F->getParent(), - StoreInts[fArgs.size() - 3], Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), StoreInts[fArgs.size() - 3], Tys); else - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), StoreLaneInts[fArgs.size() - 5], Tys); return true; } @@ -810,8 +811,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .StartsWith("rbit", Intrinsic::bitreverse) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } @@ -821,8 +822,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return false; // Invalid IR. 
VectorType *Ty = dyn_cast(F->getReturnType()); if (Ty && Ty->getElementType()->isFloatingPointTy()) { - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::aarch64_neon_faddp, Ty); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_neon_faddp, Ty); return true; } } @@ -840,7 +841,7 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, .Case("mlalt", Intrinsic::aarch64_sve_bfmlalt_lane_v2) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } return false; // No other 'aarch64.sve.bf*.lane'. @@ -861,8 +862,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, auto Args = F->getFunctionType()->params(); Type *Tys[] = {F->getReturnType(), Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::aarch64_sve_faddqv, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_sve_faddqv, Tys); return true; } @@ -880,8 +881,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, Intrinsic::aarch64_sve_ld3_sret, Intrinsic::aarch64_sve_ld4_sret, }; - NewFn = Intrinsic::getDeclaration(F->getParent(), - LoadIDs[Name[0] - '2'], Ty); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + LoadIDs[Name[0] - '2'], Ty); return true; } return false; // No other 'aarch64.sve.ld*'. @@ -892,8 +893,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, if (Name.starts_with("get")) { // 'aarch64.sve.tuple.get*'. 
Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_extract, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_extract, Tys); return true; } @@ -901,8 +902,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // 'aarch64.sve.tuple.set*'. auto Args = F->getFunctionType()->params(); Type *Tys[] = {Args[0], Args[2], Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_insert, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_insert, Tys); return true; } @@ -911,8 +912,8 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, // 'aarch64.sve.tuple.create*'. auto Args = F->getFunctionType()->params(); Type *Tys[] = {F->getReturnType(), Args[1]}; - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::vector_insert, Tys); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::vector_insert, Tys); return true; } return false; // No other 'aarch64.sve.tuple.*'. 
@@ -1026,8 +1027,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.consume_front("amdgcn.")) { if (Name == "alignbit") { // Target specific intrinsic became redundant - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::fshr, - {F->getReturnType()}); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::fshr, {F->getReturnType()}); return true; } @@ -1056,9 +1057,9 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.starts_with("ldexp.")) { // Target specific intrinsic became redundant - NewFn = Intrinsic::getDeclaration( - F->getParent(), Intrinsic::ldexp, - {F->getReturnType(), F->getArg(1)->getType()}); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::ldexp, + {F->getReturnType(), F->getArg(1)->getType()}); return true; } break; // No other 'amdgcn.*' @@ -1074,15 +1075,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->arg_begin()->getType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->arg_begin()->getType()); return true; } } if (F->arg_size() == 2 && Name == "coro.end") { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::coro_end); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::coro_end); return true; } @@ -1105,7 +1107,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // converted to DbgVariableRecords later. if (Name == "addr" || (Name == "value" && F->arg_size() == 4)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::dbg_value); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::dbg_value); return true; } break; // No other 'dbg.*'. 
@@ -1135,7 +1138,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // Inserting overloads the inserted type. Tys.push_back(FT->getParamType(1)); rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return true; } @@ -1171,8 +1174,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { rename(F); auto Args = F->getFunctionType()->params(); - NewFn = - Intrinsic::getDeclaration(F->getParent(), ID, {Args[V2 ? 1 : 0]}); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + {Args[V2 ? 1 : 0]}); return true; } break; // No other 'expermental.vector.reduce.*'. @@ -1182,15 +1185,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (Name.consume_front("experimental.stepvector.")) { Intrinsic::ID ID = Intrinsic::stepvector; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, - F->getFunctionType()->getReturnType()); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), ID, F->getFunctionType()->getReturnType()); return true; } break; // No other 'e*'. 
case 'f': if (Name.starts_with("flt.rounds")) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::get_rounding); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::get_rounding); return true; } break; @@ -1200,8 +1204,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, auto Args = F->getFunctionType()->params(); Type* ObjectPtr[1] = {Args[0]}; rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), - Intrinsic::launder_invariant_group, ObjectPtr); + NewFn = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::launder_invariant_group, ObjectPtr); return true; } break; @@ -1218,7 +1222,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, // Get the types of dest, src, and len ArrayRef ParamTypes = F->getFunctionType()->params().slice(0, 3); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ParamTypes); + NewFn = + Intrinsic::getOrInsertDeclaration(F->getParent(), ID, ParamTypes); return true; } } @@ -1230,8 +1235,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, FT->getParamType(0), // Dest FT->getParamType(2) // len }; - NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::memset, - ParamTypes); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::memset, ParamTypes); return true; } break; @@ -1247,8 +1252,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Case("popc.i", Intrinsic::ctpop) .Default(Intrinsic::not_intrinsic); if (IID != Intrinsic::not_intrinsic) { - NewFn = Intrinsic::getDeclaration(F->getParent(), IID, - {F->getReturnType()}); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID, + {F->getReturnType()}); return true; } } @@ -1316,8 +1321,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys, F->getParent())) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), 
Intrinsic::objectsize, - Tys); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::objectsize, Tys); return true; } } @@ -1326,7 +1331,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, case 'p': if (Name.starts_with("ptr.annotation.") && F->arg_size() == 4) { rename(F); - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::ptr_annotation, {F->arg_begin()->getType(), F->getArg(1)->getType()}); return true; @@ -1345,7 +1350,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { if (!F->getFunctionType()->getParamType(2)->isIntegerTy(32)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. @@ -1359,7 +1364,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (!F->getFunctionType()->getParamType(2)->isIntegerTy(32) || F->getFunctionType()->getReturnType()->isIntegerTy(64)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. @@ -1376,7 +1381,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, if (ID != Intrinsic::not_intrinsic) { if (F->getFunctionType()->getReturnType()->isIntegerTy(64)) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other applicable upgrades. 
@@ -1395,7 +1400,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, case 'v': { if (Name == "var.annotation" && F->arg_size() == 4) { rename(F); - NewFn = Intrinsic::getDeclaration( + NewFn = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::var_annotation, {{F->arg_begin()->getType(), F->getArg(1)->getType()}}); return true; @@ -1413,8 +1418,8 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = - Intrinsic::getDeclaration(F->getParent(), ID, F->getReturnType()); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, + F->getReturnType()); return true; } @@ -1426,7 +1431,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) { rename(F); - NewFn = Intrinsic::getDeclaration(F->getParent(), ID); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID); return true; } break; // No other 'wasm.dot.i8x16.i7x16.*'. @@ -1740,8 +1745,8 @@ static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, if (!IndexForm) std::swap(Args[0], Args[1]); - Value *V = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID), - Args); + Value *V = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Builder.CreateBitCast(CI.getArgOperand(1), Ty); @@ -1753,7 +1758,7 @@ static Value *upgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, Type *Ty = CI.getType(); Value *Op0 = CI.getOperand(0); Value *Op1 = CI.getOperand(1); - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Op0, Op1}); if (CI.arg_size() == 4) { // For masked intrinsics. 
@@ -1780,7 +1785,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Src, Src, Amt}); if (CI.arg_size() == 4) { // For masked intrinsics. @@ -1850,7 +1855,7 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsShiftRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); Value *Res = Builder.CreateCall(Intrin, {Op0, Op1, Amt}); unsigned NumArgs = CI.arg_size(); @@ -1911,7 +1916,8 @@ static Value *upgradeMaskedLoad(IRBuilder<> &Builder, Value *Ptr, static Value *upgradeAbs(IRBuilder<> &Builder, CallBase &CI) { Type *Ty = CI.getType(); Value *Op0 = CI.getArgOperand(0); - Function *F = Intrinsic::getDeclaration(CI.getModule(), Intrinsic::abs, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(CI.getModule(), Intrinsic::abs, Ty); Value *Res = Builder.CreateCall(F, {Op0, Builder.getInt1(false)}); if (CI.arg_size() == 3) Res = emitX86Select(Builder, CI.getArgOperand(2), Res, CI.getArgOperand(1)); @@ -2004,7 +2010,7 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallBase &CI, // Replace a masked intrinsic with an older unmasked intrinsic. 
static Value *upgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { - Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID); + Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID); Value *Rep = Builder.CreateCall(Intrin, { CI.getArgOperand(0), CI.getArgOperand(1) }); return emitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); @@ -2263,8 +2269,8 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, SmallVector Args(CI.args()); Args.pop_back(); Args.pop_back(); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); unsigned NumArgs = CI.arg_size(); Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep, CI.getArgOperand(NumArgs - 2)); @@ -2320,8 +2326,8 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. Value *Arg = CI->getArgOperand(0); Value *Ctlz = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - {Arg->getType()}), + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, + {Arg->getType()}), {Arg, Builder.getFalse()}, "ctlz"); Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); } else if (Name == "popc.ll") { @@ -2329,15 +2335,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, // i64. 
Value *Arg = CI->getArgOperand(0); Value *Popc = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctpop, - {Arg->getType()}), + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctpop, + {Arg->getType()}), Arg, "ctpop"); Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); } else if (Name == "h2f") { - Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::convert_from_fp16, - {Builder.getFloatTy()}), - CI->getArgOperand(0), "h2f"); + Rep = Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::convert_from_fp16, + {Builder.getFloatTy()}), + CI->getArgOperand(0), "h2f"); } else if (Name.consume_front("bitcast.") && (Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll")) { @@ -2373,7 +2379,7 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, if (IID != Intrinsic::not_intrinsic && !F->getReturnType()->getScalarType()->isBFloatTy()) { rename(F); - Function *NewFn = Intrinsic::getDeclaration(F->getParent(), IID); + Function *NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); SmallVector Args; for (size_t I = 0; I < NewFn->arg_size(); ++I) { Value *Arg = CI->getArgOperand(I); @@ -2480,15 +2486,15 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name == "sse.sqrt.ss" || Name == "sse2.sqrt.sd") { Value *Vec = CI->getArgOperand(0); Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0); - Function *Intr = Intrinsic::getDeclaration(F->getParent(), Intrinsic::sqrt, - Elt0->getType()); + Function *Intr = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::sqrt, Elt0->getType()); Elt0 = Builder.CreateCall(Intr, Elt0); Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); } else if (Name.starts_with("avx.sqrt.p") || Name.starts_with("sse2.sqrt.p") || Name.starts_with("sse.sqrt.p")) { Rep = - Builder.CreateCall(Intrinsic::getDeclaration( + 
Builder.CreateCall(Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::sqrt, CI->getType()), {CI->getArgOperand(0)}); } else if (Name.starts_with("avx512.mask.sqrt.p")) { @@ -2499,13 +2505,13 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, : Intrinsic::x86_avx512_sqrt_pd_512; Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); } else { - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), - Intrinsic::sqrt, - CI->getType()), - {CI->getArgOperand(0)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::sqrt, + CI->getType()), + {CI->getArgOperand(0)}); } Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); @@ -2629,8 +2635,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, break; } - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.fpclass.p")) { Type *OpTy = CI->getArgOperand(0)->getType(); @@ -2652,8 +2659,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else llvm_unreachable("Unexpected intrinsic"); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.cmp.p")) { SmallVector 
Args(CI->args()); @@ -2681,8 +2689,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, std::swap(Mask, Args.back()); Args.push_back(Mask); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Args); } else if (Name.starts_with("avx512.mask.cmp.")) { // Integer compare intrinsics. unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); @@ -2776,8 +2784,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, cast(CI->getArgOperand(3))->getZExtValue() != 4)) { Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round : Intrinsic::x86_avx512_sitofp_round; - Function *F = - Intrinsic::getDeclaration(CI->getModule(), IID, {DstTy, SrcTy}); + Function *F = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID, + {DstTy, SrcTy}); Rep = Builder.CreateCall(F, {Rep, CI->getArgOperand(3)}); } else { Rep = IsUnsigned ? 
Builder.CreateUIToFP(Rep, DstTy, "cvt") @@ -2819,7 +2827,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), ResultTy->getNumElements()); - Function *ELd = Intrinsic::getDeclaration( + Function *ELd = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::masked_expandload, ResultTy); Rep = Builder.CreateCall(ELd, {Ptr, MaskVec, CI->getOperand(1)}); } else if (Name.starts_with("avx512.mask.compress.store.")) { @@ -2834,7 +2842,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, getX86MaskVec(Builder, CI->getArgOperand(2), cast(ResultTy)->getNumElements()); - Function *CSt = Intrinsic::getDeclaration( + Function *CSt = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::masked_compressstore, ResultTy); Rep = Builder.CreateCall(CSt, {CI->getArgOperand(1), Ptr, MaskVec}); } else if (Name.starts_with("avx512.mask.compress.") || @@ -2847,7 +2855,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool IsCompress = Name[12] == 'c'; Intrinsic::ID IID = IsCompress ? 
Intrinsic::x86_avx512_mask_compress : Intrinsic::x86_avx512_mask_expand; - Function *Intr = Intrinsic::getDeclaration(F->getParent(), IID, ResultTy); + Function *Intr = + Intrinsic::getOrInsertDeclaration(F->getParent(), IID, ResultTy); Rep = Builder.CreateCall(Intr, {CI->getOperand(0), CI->getOperand(1), MaskVec}); } else if (Name.starts_with("xop.vpcom")) { @@ -2910,7 +2919,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool ZeroMask = Name[11] == 'z'; Rep = upgradeX86ConcatShift(Builder, *CI, true, ZeroMask); } else if (Name == "sse42.crc32.64.8") { - Function *CRC32 = Intrinsic::getDeclaration( + Function *CRC32 = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::x86_sse42_crc32_32_8); Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); @@ -3405,7 +3414,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_add_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3421,7 +3430,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_div_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3437,7 +3446,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_mul_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), 
CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3453,7 +3462,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_sub_pd_512; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3471,13 +3480,13 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble]; Rep = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), IID), + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.lzcnt.")) { Rep = - Builder.CreateCall(Intrinsic::getDeclaration( + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::ctlz, CI->getType()), {CI->getArgOperand(0), Builder.getInt1(false)}); Rep = @@ -3723,10 +3732,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (NegAcc) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, + Ops[0]->getType()), + Ops); if (IsScalar) Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0); @@ -3738,10 +3747,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - Rep = 
Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), - Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, + Ops[0]->getType()), + Ops); Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), Rep, (uint64_t)0); @@ -3781,11 +3790,11 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_vfmadd_f64; else IID = Intrinsic::x86_avx512_vfmadd_f32; - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), IID); + Function *FMA = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID); Rep = Builder.CreateCall(FMA, Ops); } else { - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - A->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, A->getType()); Rep = Builder.CreateCall(FMA, {A, B, C}); } @@ -3837,11 +3846,12 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_vfmadd_pd_512; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - {A, B, C, CI->getArgOperand(4)}); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + {A, B, C, CI->getArgOperand(4)}); } else { - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - A->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, A->getType()); Rep = Builder.CreateCall(FMA, {A, B, C}); } @@ -3868,8 +3878,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = - Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); } else if 
(Name.starts_with("avx512.mask.vfmaddsub.p") || Name.starts_with("avx512.mask3.vfmaddsub.p") || Name.starts_with("avx512.maskz.vfmaddsub.p") || @@ -3892,16 +3902,16 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (IsSubAdd) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), - Ops); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); } else { int NumElts = cast(CI->getType())->getNumElements(); Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()); + Function *FMA = Intrinsic::getOrInsertDeclaration( + CI->getModule(), Intrinsic::fma, Ops[0]->getType()); Value *Odd = Builder.CreateCall(FMA, Ops); Ops[2] = Builder.CreateFNeg(Ops[2]); Value *Even = Builder.CreateCall(FMA, Ops); @@ -3944,8 +3954,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(3)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); @@ -3972,8 +3982,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4008,8 +4018,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4038,8 +4048,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI->getModule(), IID), - Args); + Rep = Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4062,7 +4072,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Value *NewCall = Builder.CreateCall( - Intrinsic::getDeclaration(CI->getModule(), IID), Args); + Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); // Extract the second result and store it. 
Value *Data = Builder.CreateExtractValue(NewCall, 1); @@ -4108,7 +4118,7 @@ static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]); - Function *NewF = Intrinsic::getDeclaration(CI->getModule(), NewID); + Function *NewF = Intrinsic::getOrInsertDeclaration(CI->getModule(), NewID); return Builder.CreateCall(NewF, Args, CI->getName()); } @@ -4117,16 +4127,17 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (Name == "mve.vctp64.old") { // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the // correct type. - Value *VCTP = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), Intrinsic::arm_mve_vctp64), - CI->getArgOperand(0), CI->getName()); + Value *VCTP = + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::arm_mve_vctp64), + CI->getArgOperand(0), CI->getName()); Value *C1 = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_v2i, {VectorType::get(Builder.getInt1Ty(), 2, false)}), VCTP); return Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_i2v, {VectorType::get(Builder.getInt1Ty(), 4, false)}), C1); @@ -4188,19 +4199,19 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, Type *Ty = Op->getType(); if (Ty->getScalarSizeInBits() == 1) { Value *C1 = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::arm_mve_pred_v2i, {VectorType::get(Builder.getInt1Ty(), 4, false)}), Op); Op = Builder.CreateCall( - Intrinsic::getDeclaration(F->getParent(), - Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), + Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), C1); } Ops.push_back(Op); } - Function *Fn 
= Intrinsic::getDeclaration(F->getParent(), ID, Tys); + Function *Fn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); return Builder.CreateCall(Fn, Ops, CI->getName()); } llvm_unreachable("Unknown function for ARM CallBase upgrade."); @@ -5088,7 +5099,8 @@ void llvm::UpgradeARCRuntime(Module &M) { if (!Fn) return; - Function *NewFn = llvm::Intrinsic::getDeclaration(&M, IntrinsicFunc); + Function *NewFn = + llvm::Intrinsic::getOrInsertDeclaration(&M, IntrinsicFunc); for (User *U : make_early_inc_range(Fn->users())) { CallInst *CI = dyn_cast(U); diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index ee084e870263..1cf998c68500 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -2468,7 +2468,7 @@ LLVMValueRef LLVMGetIntrinsicDeclaration(LLVMModuleRef Mod, size_t ParamCount) { ArrayRef Tys(unwrap(ParamTypes), ParamCount); auto IID = llvm_map_to_intrinsic_id(ID); - return wrap(llvm::Intrinsic::getDeclaration(unwrap(Mod), IID, Tys)); + return wrap(llvm::Intrinsic::getOrInsertDeclaration(unwrap(Mod), IID, Tys)); } const char *LLVMIntrinsicGetName(unsigned ID, size_t *NameLength) { diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 0db82cdd6373..447a9d651746 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -991,7 +991,7 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, LLVMContext &Ctx = LinkedInstr->getContext(); Module *M = LinkedInstr->getModule(); if (!AssignFn) - AssignFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign); + AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); std::array Args = { MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)), @@ -1060,7 +1060,7 @@ static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) { } static Function *getDeclareIntrin(Module &M) { - return Intrinsic::getDeclaration(&M, Intrinsic::dbg_declare); + return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare); } DbgInstPtr 
DIBuilder::insertDbgValueIntrinsic( @@ -1074,7 +1074,7 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic( } if (!ValueFn) - ValueFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_value); + ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value); return insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertBB, InsertBefore); } @@ -1175,7 +1175,7 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, } if (!LabelFn) - LabelFn = Intrinsic::getDeclaration(&M, Intrinsic::dbg_label); + LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label); Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)}; diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp index 0db908211b55..b37dbd534092 100644 --- a/llvm/lib/IR/DebugProgramInstruction.cpp +++ b/llvm/lib/IR/DebugProgramInstruction.cpp @@ -413,13 +413,13 @@ DbgVariableRecord::createDebugIntrinsic(Module *M, // Work out what sort of intrinsic we're going to produce. 
switch (getType()) { case DbgVariableRecord::LocationType::Declare: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); break; case DbgVariableRecord::LocationType::Value: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_value); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_value); break; case DbgVariableRecord::LocationType::Assign: - IntrinsicFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_assign); + IntrinsicFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); break; case DbgVariableRecord::LocationType::End: case DbgVariableRecord::LocationType::Any: @@ -459,7 +459,7 @@ DbgVariableRecord::createDebugIntrinsic(Module *M, DbgLabelInst * DbgLabelRecord::createDebugIntrinsic(Module *M, Instruction *InsertBefore) const { - auto *LabelFn = Intrinsic::getDeclaration(M, Intrinsic::dbg_label); + auto *LabelFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_label); Value *Args[] = { MetadataAsValue::get(getDebugLoc()->getContext(), getLabel())}; DbgLabelInst *DbgLabel = cast( diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 8bf695e835c3..3654bf9a9e70 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -91,8 +91,8 @@ Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) { if (cast(Scaling)->isZero()) return Scaling; Module *M = GetInsertBlock()->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::vscale, {Scaling->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, + {Scaling->getType()}); CallInst *CI = CreateCall(TheFn, {}, {}, Name); return cast(Scaling)->isOne() ? 
CI : CreateMul(CI, Scaling); } @@ -142,7 +142,8 @@ CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)}; Type *Tys[] = { Ptr->getType(), Size->getType() }; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -170,7 +171,8 @@ CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)}; Type *Tys[] = {Dst->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset_inline, Tys); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset_inline, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -197,7 +199,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( Value *Ops[] = {Ptr, Val, Size, getInt32(ElementSize)}; Type *Tys[] = {Ptr->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memset_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -227,7 +229,7 @@ CallInst *IRBuilderBase::CreateMemTransferInst( Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)}; Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() }; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, IntrID, Tys); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, IntrID, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -265,7 +267,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy( Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - 
Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memcpy_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -381,7 +383,7 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove( Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration( + Function *TheFn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::memmove_element_unordered_atomic, Tys); CallInst *CI = CreateCall(TheFn, Ops); @@ -411,23 +413,23 @@ CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Src}; Type *Tys[] = { Src->getType() }; - auto Decl = Intrinsic::getDeclaration(M, ID, Tys); + auto Decl = Intrinsic::getOrInsertDeclaration(M, ID, Tys); return CreateCall(Decl, Ops); } CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fadd, - {Src->getType()}); + auto Decl = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reduce_fadd, {Src->getType()}); return CreateCall(Decl, Ops); } CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { Module *M = GetInsertBlock()->getParent()->getParent(); Value *Ops[] = {Acc, Src}; - auto Decl = Intrinsic::getDeclaration(M, Intrinsic::vector_reduce_fmul, - {Src->getType()}); + auto Decl = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reduce_fmul, {Src->getType()}); return CreateCall(Decl, Ops); } @@ -489,8 +491,8 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { "lifetime.start requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); - Function *TheFn = - 
Intrinsic::getDeclaration(M, Intrinsic::lifetime_start, {Ptr->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::lifetime_start, {Ptr->getType()}); return CreateCall(TheFn, Ops); } @@ -504,8 +506,8 @@ CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) { "lifetime.end requires the size to be an i64"); Value *Ops[] = { Size, Ptr }; Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::lifetime_end, {Ptr->getType()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::lifetime_end, {Ptr->getType()}); return CreateCall(TheFn, Ops); } @@ -523,8 +525,8 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) { // Fill in the single overloaded type: memory object type. Type *ObjectPtr[1] = {Ptr->getType()}; Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::invariant_start, ObjectPtr); + Function *TheFn = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::invariant_start, ObjectPtr); return CreateCall(TheFn, Ops); } @@ -556,13 +558,13 @@ IRBuilderBase::CreateAssumption(Value *Cond, Value *Ops[] = { Cond }; Module *M = BB->getParent()->getParent(); - Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); + Function *FnAssume = Intrinsic::getOrInsertDeclaration(M, Intrinsic::assume); return CreateCall(FnAssume, Ops, OpBundles); } Instruction *IRBuilderBase::CreateNoAliasScopeDeclaration(Value *Scope) { Module *M = BB->getModule(); - auto *FnIntrinsic = Intrinsic::getDeclaration( + auto *FnIntrinsic = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_noalias_scope_decl, {}); return CreateCall(FnIntrinsic, {Scope}); } @@ -615,7 +617,7 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef OverloadedTypes, const Twine &Name) { Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Id, 
OverloadedTypes); + Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Id, OverloadedTypes); return CreateCall(TheFn, Ops, {}, Name); } @@ -765,9 +767,9 @@ static CallInst *CreateGCStatepointCallCommon( const Twine &Name) { Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) - Function *FnStatepoint = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, - {ActualCallee.getCallee()->getType()}); + Function *FnStatepoint = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_statepoint, + {ActualCallee.getCallee()->getType()}); std::vector Args = getStatepointArgs( *Builder, ID, NumPatchBytes, ActualCallee.getCallee(), Flags, CallArgs); @@ -820,9 +822,9 @@ static InvokeInst *CreateGCStatepointInvokeCommon( const Twine &Name) { Module *M = Builder->GetInsertBlock()->getParent()->getParent(); // Fill in the one generic type'd argument (the function is also vararg) - Function *FnStatepoint = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_statepoint, - {ActualInvokee.getCallee()->getType()}); + Function *FnStatepoint = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_statepoint, + {ActualInvokee.getCallee()->getType()}); std::vector Args = getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee.getCallee(), @@ -875,7 +877,7 @@ CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint, Intrinsic::ID ID = Intrinsic::experimental_gc_result; Module *M = BB->getParent()->getParent(); Type *Types[] = {ResultType}; - Function *FnGCResult = Intrinsic::getDeclaration(M, ID, Types); + Function *FnGCResult = Intrinsic::getOrInsertDeclaration(M, ID, Types); Value *Args[] = {Statepoint}; return CreateCall(FnGCResult, Args, {}, Name); @@ -886,8 +888,8 @@ CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint, Type *ResultType, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *Types[] = 
{ResultType}; - Function *FnGCRelocate = - Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, Types); + Function *FnGCRelocate = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_relocate, Types); Value *Args[] = {Statepoint, getInt32(BaseOffset), getInt32(DerivedOffset)}; return CreateCall(FnGCRelocate, Args, {}, Name); @@ -897,7 +899,7 @@ CallInst *IRBuilderBase::CreateGCGetPointerBase(Value *DerivedPtr, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *PtrTy = DerivedPtr->getType(); - Function *FnGCFindBase = Intrinsic::getDeclaration( + Function *FnGCFindBase = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_gc_get_pointer_base, {PtrTy, PtrTy}); return CreateCall(FnGCFindBase, {DerivedPtr}, {}, Name); } @@ -906,7 +908,7 @@ CallInst *IRBuilderBase::CreateGCGetPointerOffset(Value *DerivedPtr, const Twine &Name) { Module *M = BB->getParent()->getParent(); Type *PtrTy = DerivedPtr->getType(); - Function *FnGCGetOffset = Intrinsic::getDeclaration( + Function *FnGCGetOffset = Intrinsic::getOrInsertDeclaration( M, Intrinsic::experimental_gc_get_pointer_offset, {PtrTy}); return CreateCall(FnGCGetOffset, {DerivedPtr}, {}, Name); } @@ -915,7 +917,7 @@ CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, {V->getType()}); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, {V->getType()}); return createCallHelper(Fn, {V}, Name, FMFSource); } @@ -923,7 +925,7 @@ Value *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, { LHS->getType() }); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, {LHS->getType()}); if (Value *V = Folder.FoldBinaryIntrinsic(ID, LHS, RHS, Fn->getReturnType(), FMFSource)) return V; 
@@ -936,7 +938,7 @@ CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID, Instruction *FMFSource, const Twine &Name) { Module *M = BB->getModule(); - Function *Fn = Intrinsic::getDeclaration(M, ID, Types); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, Types); return createCallHelper(Fn, Args, Name, FMFSource); } @@ -963,7 +965,7 @@ CallInst *IRBuilderBase::CreateIntrinsic(Type *RetTy, Intrinsic::ID ID, "Wrong types for intrinsic!"); // TODO: Handle varargs intrinsics. - Function *Fn = Intrinsic::getDeclaration(M, ID, OverloadTys); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, ID, OverloadTys); return createCallHelper(Fn, Args, Name, FMFSource); } @@ -1120,7 +1122,7 @@ Value *IRBuilderBase::CreateLaunderInvariantGroup(Value *Ptr) { "launder.invariant.group only applies to pointers."); auto *PtrType = Ptr->getType(); Module *M = BB->getParent()->getParent(); - Function *FnLaunderInvariantGroup = Intrinsic::getDeclaration( + Function *FnLaunderInvariantGroup = Intrinsic::getOrInsertDeclaration( M, Intrinsic::launder_invariant_group, {PtrType}); assert(FnLaunderInvariantGroup->getReturnType() == PtrType && @@ -1137,7 +1139,7 @@ Value *IRBuilderBase::CreateStripInvariantGroup(Value *Ptr) { auto *PtrType = Ptr->getType(); Module *M = BB->getParent()->getParent(); - Function *FnStripInvariantGroup = Intrinsic::getDeclaration( + Function *FnStripInvariantGroup = Intrinsic::getOrInsertDeclaration( M, Intrinsic::strip_invariant_group, {PtrType}); assert(FnStripInvariantGroup->getReturnType() == PtrType && @@ -1152,7 +1154,8 @@ Value *IRBuilderBase::CreateVectorReverse(Value *V, const Twine &Name) { auto *Ty = cast(V->getType()); if (isa(Ty)) { Module *M = BB->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_reverse, Ty); return Insert(CallInst::Create(F, V), Name); } // Keep the original behaviour for fixed vector @@ -1171,7 
+1174,8 @@ Value *IRBuilderBase::CreateVectorSplice(Value *V1, Value *V2, int64_t Imm, if (auto *VTy = dyn_cast(V1->getType())) { Module *M = BB->getParent()->getParent(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::vector_splice, VTy); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::vector_splice, VTy); Value *Ops[] = {V1, V2, getInt32(Imm)}; return Insert(CallInst::Create(F, Ops), Name); @@ -1225,7 +1229,7 @@ Value *IRBuilderBase::CreatePreserveArrayAccessIndex( Type *ResultType = GetElementPtrInst::getGEPReturnType(Base, IdxList); Module *M = BB->getParent()->getParent(); - Function *FnPreserveArrayAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveArrayAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_array_access_index, {ResultType, BaseType}); Value *DimV = getInt32(Dimension); @@ -1246,7 +1250,7 @@ Value *IRBuilderBase::CreatePreserveUnionAccessIndex( auto *BaseType = Base->getType(); Module *M = BB->getParent()->getParent(); - Function *FnPreserveUnionAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveUnionAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_union_access_index, {BaseType, BaseType}); Value *DIIndex = getInt32(FieldIndex); @@ -1271,7 +1275,7 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( GetElementPtrInst::getGEPReturnType(Base, {Zero, GEPIndex}); Module *M = BB->getParent()->getParent(); - Function *FnPreserveStructAccessIndex = Intrinsic::getDeclaration( + Function *FnPreserveStructAccessIndex = Intrinsic::getOrInsertDeclaration( M, Intrinsic::preserve_struct_access_index, {ResultType, BaseType}); Value *DIIndex = getInt32(FieldIndex); @@ -1288,8 +1292,8 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex( Value *IRBuilderBase::createIsFPClass(Value *FPNum, unsigned Test) { ConstantInt *TestV = getInt32(Test); Module *M = BB->getParent()->getParent(); - Function *FnIsFPClass = - Intrinsic::getDeclaration(M, Intrinsic::is_fpclass, 
{FPNum->getType()}); + Function *FnIsFPClass = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::is_fpclass, {FPNum->getType()}); return CreateCall(FnIsFPClass, {FPNum, TestV}); } diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 0a6c93fde630..002bab8e079e 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -629,9 +629,8 @@ bool VPIntrinsic::canIgnoreVectorLengthParam() const { return false; } -Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, - Type *ReturnType, - ArrayRef Params) { +Function *VPIntrinsic::getOrInsertDeclarationForParams( + Module *M, Intrinsic::ID VPID, Type *ReturnType, ArrayRef Params) { assert(isVPIntrinsic(VPID) && "not a VP intrinsic"); Function *VPFunc; switch (VPID) { @@ -641,7 +640,7 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, OverloadTy = Params[*VPReductionIntrinsic::getVectorParamPos(VPID)]->getType(); - VPFunc = Intrinsic::getDeclaration(M, VPID, OverloadTy); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, OverloadTy); break; } case Intrinsic::vp_trunc: @@ -658,43 +657,43 @@ Function *VPIntrinsic::getDeclarationForParams(Module *M, Intrinsic::ID VPID, case Intrinsic::vp_lrint: case Intrinsic::vp_llrint: case Intrinsic::vp_cttz_elts: - VPFunc = - Intrinsic::getDeclaration(M, VPID, {ReturnType, Params[0]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration( + M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_is_fpclass: - VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[0]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, {Params[0]->getType()}); break; case Intrinsic::vp_merge: case Intrinsic::vp_select: - VPFunc = Intrinsic::getDeclaration(M, VPID, {Params[1]->getType()}); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, {Params[1]->getType()}); break; case Intrinsic::vp_load: - VPFunc = Intrinsic::getDeclaration( + VPFunc = 
Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::experimental_vp_strided_load: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::vp_gather: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {ReturnType, Params[0]->getType()}); break; case Intrinsic::vp_store: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::experimental_vp_strided_store: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType(), Params[2]->getType()}); break; case Intrinsic::vp_scatter: - VPFunc = Intrinsic::getDeclaration( + VPFunc = Intrinsic::getOrInsertDeclaration( M, VPID, {Params[0]->getType(), Params[1]->getType()}); break; case Intrinsic::experimental_vp_splat: - VPFunc = Intrinsic::getDeclaration(M, VPID, ReturnType); + VPFunc = Intrinsic::getOrInsertDeclaration(M, VPID, ReturnType); break; } assert(VPFunc && "Could not declare VP intrinsic"); diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index ef26b1926b97..ff8b4b7a020c 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -713,7 +713,8 @@ Intrinsic::ID Intrinsic::lookupIntrinsicID(StringRef Name) { #include "llvm/IR/IntrinsicImpl.inc" #undef GET_INTRINSIC_ATTRIBUTES -Function *Intrinsic::getDeclaration(Module *M, ID id, ArrayRef Tys) { +Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id, + ArrayRef Tys) { // There can never be multiple globals with the same name of different types, // because intrinsics must be a specific type. 
auto *FT = getType(M->getContext(), id, Tys); @@ -1078,7 +1079,7 @@ std::optional Intrinsic::remangleIntrinsicFunction(Function *F) { // invalid and we'll get an error. ExistingGV->setName(WantedName + ".renamed"); } - return Intrinsic::getDeclaration(F->getParent(), ID, ArgTys); + return Intrinsic::getOrInsertDeclaration(F->getParent(), ID, ArgTys); }(); NewDecl->setCallingConv(F->getCallingConv()); diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp index 704bc8d339bc..ab48d3e4101b 100644 --- a/llvm/lib/IR/Module.cpp +++ b/llvm/lib/IR/Module.cpp @@ -89,21 +89,22 @@ Module::~Module() { void Module::removeDebugIntrinsicDeclarations() { auto *DeclareIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_declare); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_declare); assert((!isMaterialized() || DeclareIntrinsicFn->hasZeroLiveUses()) && "Debug declare intrinsic should have had uses removed."); DeclareIntrinsicFn->eraseFromParent(); auto *ValueIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_value); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_value); assert((!isMaterialized() || ValueIntrinsicFn->hasZeroLiveUses()) && "Debug value intrinsic should have had uses removed."); ValueIntrinsicFn->eraseFromParent(); auto *AssignIntrinsicFn = - Intrinsic::getDeclaration(this, Intrinsic::dbg_assign); + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_assign); assert((!isMaterialized() || AssignIntrinsicFn->hasZeroLiveUses()) && "Debug assign intrinsic should have had uses removed."); AssignIntrinsicFn->eraseFromParent(); - auto *LabelntrinsicFn = Intrinsic::getDeclaration(this, Intrinsic::dbg_label); + auto *LabelntrinsicFn = + Intrinsic::getOrInsertDeclaration(this, Intrinsic::dbg_label); assert((!isMaterialized() || LabelntrinsicFn->hasZeroLiveUses()) && "Debug label intrinsic should have had uses removed."); LabelntrinsicFn->eraseFromParent(); diff --git a/llvm/lib/IR/VectorBuilder.cpp 
b/llvm/lib/IR/VectorBuilder.cpp index f42948ba8904..737f49b1334d 100644 --- a/llvm/lib/IR/VectorBuilder.cpp +++ b/llvm/lib/IR/VectorBuilder.cpp @@ -108,8 +108,8 @@ Value *VectorBuilder::createVectorInstructionImpl(Intrinsic::ID VPID, if (VLenPosOpt) IntrinParams[*VLenPosOpt] = &requestEVL(); - auto *VPDecl = VPIntrinsic::getDeclarationForParams(&getModule(), VPID, - ReturnTy, IntrinParams); + auto *VPDecl = VPIntrinsic::getOrInsertDeclarationForParams( + &getModule(), VPID, ReturnTy, IntrinParams); return Builder.CreateCall(VPDecl, IntrinParams, Name); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8a217cd1ec5c..ae96e277b5fc 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16454,8 +16454,8 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes), VecTy)); if (Parts.size() == 4) { - auto *F = Intrinsic::getDeclaration(TI->getModule(), - Intrinsic::aarch64_neon_tbl4, VecTy); + auto *F = Intrinsic::getOrInsertDeclaration( + TI->getModule(), Intrinsic::aarch64_neon_tbl4, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); Results.push_back(Builder.CreateCall(F, Parts)); Parts.clear(); @@ -16484,7 +16484,7 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) { break; } - auto *F = Intrinsic::getDeclaration(TI->getModule(), TblID, VecTy); + auto *F = Intrinsic::getOrInsertDeclaration(TI->getModule(), TblID, VecTy); Parts.push_back(ConstantVector::get(MaskConst)); Results.push_back(Builder.CreateCall(F, Parts)); } @@ -16765,9 +16765,10 @@ static Function *getStructuredLoadFunction(Module *M, unsigned Factor, Intrinsic::aarch64_neon_ld3, Intrinsic::aarch64_neon_ld4}; if (Scalable) - return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy}); + return Intrinsic::getOrInsertDeclaration(M, SVELoads[Factor - 2], {LDVTy}); - return 
Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy}); + return Intrinsic::getOrInsertDeclaration(M, NEONLoads[Factor - 2], + {LDVTy, PtrTy}); } static Function *getStructuredStoreFunction(Module *M, unsigned Factor, @@ -16781,9 +16782,10 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor, Intrinsic::aarch64_neon_st3, Intrinsic::aarch64_neon_st4}; if (Scalable) - return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy}); + return Intrinsic::getOrInsertDeclaration(M, SVEStores[Factor - 2], {STVTy}); - return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy}); + return Intrinsic::getOrInsertDeclaration(M, NEONStores[Factor - 2], + {STVTy, PtrTy}); } /// Lower an interleaved load into a ldN intrinsic. @@ -27247,7 +27249,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, if (ValueTy->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp; - Function *Ldxr = Intrinsic::getDeclaration(M, Int); + Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int); Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi"); @@ -27266,7 +27268,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr; - Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); @@ -27281,7 +27283,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_clrex)); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -27296,7 +27299,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, if (Val->getType()->getPrimitiveSizeInBits() == 128) { Intrinsic::ID Int = IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; - Function *Stxr = Intrinsic::getDeclaration(M, Int); + Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Type *Int128Ty = Type::getInt128Ty(M->getContext()); @@ -27311,7 +27314,7 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, Intrinsic::ID Int = IsRelease ? 
Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr; Type *Tys[] = { Addr->getType() }; - Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys); + Function *Stxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys); const DataLayout &DL = M->getDataLayout(); IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType())); @@ -27348,7 +27351,7 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &, static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreatePointerCast( IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset), diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index e62437c28b86..fe96fedcfb82 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -436,10 +436,10 @@ Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst, void AArch64StackTagging::tagAlloca(AllocaInst *AI, Instruction *InsertBefore, Value *Ptr, uint64_t Size) { - auto SetTagZeroFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag_zero); - auto StgpFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_stgp); + auto SetTagZeroFunc = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_settag_zero); + auto StgpFunc = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::aarch64_stgp); InitializerBuilder IB(Size, DL, Ptr, SetTagFunc, SetTagZeroFunc, StgpFunc); bool LittleEndian = @@ -481,8 +481,8 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer( assert(PrologueBB); IRBuilder<> IRB(&PrologueBB->front()); - Function *IRG_SP = - Intrinsic::getDeclaration(F->getParent(), 
Intrinsic::aarch64_irg_sp); + Function *IRG_SP = Intrinsic::getOrInsertDeclaration( + F->getParent(), Intrinsic::aarch64_irg_sp); Instruction *Base = IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())}); Base->setName("basetag"); @@ -563,8 +563,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { LI = DeleteLI.get(); } - SetTagFunc = - Intrinsic::getDeclaration(F->getParent(), Intrinsic::aarch64_settag); + SetTagFunc = Intrinsic::getOrInsertDeclaration(F->getParent(), + Intrinsic::aarch64_settag); Instruction *Base = insertBaseTaggedPointer(*Fn.getParent(), SInfo.AllocasToInstrument, DT); @@ -580,7 +580,7 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { NextTag = (NextTag + 1) % 16; // Replace alloca with tagp(alloca). IRBuilder<> IRB(Info.AI->getNextNode()); - Function *TagP = Intrinsic::getDeclaration( + Function *TagP = Intrinsic::getOrInsertDeclaration( F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()}); Instruction *TagPCall = IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 7b74bb2a03a6..91ab3fcfc4c7 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -1637,7 +1637,7 @@ static std::optional instCombineSVEAllActive(IntrinsicInst &II, return std::nullopt; auto *Mod = II.getModule(); - auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()}); + auto *NewDecl = Intrinsic::getOrInsertDeclaration(Mod, IID, {II.getType()}); II.setCalledFunction(NewDecl); return &II; diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp index 174d95333d91..2ee16a873e33 100644 --- a/llvm/lib/Target/AArch64/SMEABIPass.cpp +++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp @@ -71,7 +71,7 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) { // A save to TPIDR2 should 
be followed by clearing TPIDR2_EL0. Function *WriteIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_set_tpidr2); Builder.CreateCall(WriteIntr->getFunctionType(), WriteIntr, Builder.getInt64(0)); } @@ -114,7 +114,7 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0. Builder.SetInsertPoint(PreludeBB); Function *TPIDR2Intr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2); auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr, {}, "tpidr2"); auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2, @@ -128,20 +128,20 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, // Enable pstate.za at the start of the function. Builder.SetInsertPoint(&OrigBB->front()); Function *EnableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_za_enable); Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr); } if (FnAttrs.isNewZA()) { Function *ZeroIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_zero); Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr, Builder.getInt32(0xff)); } if (FnAttrs.isNewZT0()) { Function *ClearZT0Intr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero_zt); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_sme_zero_zt); Builder.CreateCall(ClearZT0Intr->getFunctionType(), ClearZT0Intr, {Builder.getInt32(0)}); } @@ -153,8 +153,8 @@ bool SMEABI::updateNewStateFunctions(Module *M, Function *F, if (!T || !isa(T)) continue; Builder.SetInsertPoint(T); - Function *DisableZAIntr = - Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable); + Function *DisableZAIntr = 
Intrinsic::getOrInsertDeclaration( + M, Intrinsic::aarch64_sme_za_disable); Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr); } } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp index f408a013d7a3..ea88ed424dc5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -407,8 +407,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, Value *const Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); // Reduce within each row of 16 lanes. for (unsigned Idx = 0; Idx < 4; Idx++) { @@ -439,8 +439,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B, // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and // combine them with a scalar operation. 
- Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy); + Function *ReadLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_readlane, AtomicTy); Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)}); Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)}); return buildNonAtomicBinOp(B, Op, Lane0, Lane32); @@ -453,8 +453,8 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B, Value *Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); for (unsigned Idx = 0; Idx < 4; Idx++) { V = buildNonAtomicBinOp( @@ -513,18 +513,18 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V, Value *Identity) const { Type *AtomicTy = V->getType(); Module *M = B.GetInsertBlock()->getModule(); - Function *UpdateDPP = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy); + Function *UpdateDPP = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_update_dpp, AtomicTy); if (ST->hasDPPWavefrontShifts()) { // GFX9 has DPP wavefront shift operations. V = B.CreateCall(UpdateDPP, {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}); } else { - Function *ReadLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, AtomicTy); - Function *WriteLane = - Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, AtomicTy); + Function *ReadLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_readlane, AtomicTy); + Function *WriteLane = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::amdgcn_writelane, AtomicTy); // On GFX10 all DPP operations are confined to a single row. To get cross- // row operations we have to use permlane or readlane. 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 052e1140533f..7d3164c79089 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -119,8 +119,8 @@ public: return SqrtF32; LLVMContext &Ctx = Mod->getContext(); - SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt, - {Type::getFloatTy(Ctx)}); + SqrtF32 = Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_sqrt, + {Type::getFloatTy(Ctx)}); return SqrtF32; } @@ -129,7 +129,7 @@ public: return LdexpF32; LLVMContext &Ctx = Mod->getContext(); - LdexpF32 = Intrinsic::getDeclaration( + LdexpF32 = Intrinsic::getOrInsertDeclaration( Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)}); return LdexpF32; } @@ -577,7 +577,7 @@ bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32( Type *I32Ty = getI32Ty(Builder, I.getType()); Function *I32 = - Intrinsic::getDeclaration(Mod, Intrinsic::bitreverse, { I32Ty }); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::bitreverse, {I32Ty}); Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty); Value *ExtRes = Builder.CreateCall(I32, { ExtOp }); Value *LShrOp = @@ -1260,8 +1260,8 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl( Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty) : Builder.CreateUIToFP(IB,F32Ty); - Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, - Builder.getFloatTy()); + Function *RcpDecl = Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::amdgcn_rcp, Builder.getFloatTy()); Value *RCP = Builder.CreateCall(RcpDecl, { FB }); Value *FQM = Builder.CreateFMul(FA, RCP); @@ -1455,7 +1455,8 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder, // Initial estimate of inv(y). 
Value *FloatY = Builder.CreateUIToFP(Y, F32Ty); - Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); + Function *Rcp = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty); Value *RcpY = Builder.CreateCall(Rcp, {FloatY}); Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast(0x4F7FFFFE)); Value *ScaledY = Builder.CreateFMul(RcpY, Scale); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp index 45207c06a788..e48fed025857 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp @@ -237,7 +237,7 @@ bool optimizeSection(ArrayRef> MergeableInsts) { else NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa; - Function *NewIntrin = Intrinsic::getDeclaration( + Function *NewIntrin = Intrinsic::getOrInsertDeclaration( IIList.front()->getModule(), NewIntrinID, OverloadTys); Args[ImageDimIntr->DMaskIndex] = ConstantInt::get(DMask->getType(), NewMaskVal); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp index ecb4d4fa5d5c..6a5a48778197 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp @@ -130,7 +130,8 @@ static std::optional modifyIntrinsicCall( // Modify arguments and types Func(Args, ArgTys); - Function *I = Intrinsic::getDeclaration(OldIntr.getModule(), NewIntr, ArgTys); + Function *I = + Intrinsic::getOrInsertDeclaration(OldIntr.getModule(), NewIntr, ArgTys); CallInst *NewCall = IC.Builder.CreateCall(I, Args); NewCall->takeName(&OldIntr); @@ -502,7 +503,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp)) break; - Function *NewDecl = Intrinsic::getDeclaration( + Function *NewDecl = Intrinsic::getOrInsertDeclaration( 
SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()}); InnerFMF |= FMF; @@ -527,7 +528,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // f16 amdgcn.sqrt is identical to regular sqrt. if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) { - Function *NewDecl = Intrinsic::getDeclaration( + Function *NewDecl = Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::sqrt, {II.getType()}); II.setCalledFunction(NewDecl); return &II; @@ -614,7 +615,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { Value *Src1 = II.getArgOperand(1); const ConstantInt *CMask = dyn_cast(Src1); if (CMask) { - II.setCalledOperand(Intrinsic::getDeclaration( + II.setCalledOperand(Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::is_fpclass, Src0->getType())); // Clamp any excess bits, as they're illegal for the generic intrinsic. @@ -890,7 +891,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // register (which contains the bitmask of live threads). So a // comparison that always returns true is the same as a read of the // EXEC register. 
- Function *NewF = Intrinsic::getDeclaration( + Function *NewF = Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::read_register, II.getType()); Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")}; MDNode *MD = MDNode::get(II.getContext(), MDArgs); @@ -989,7 +990,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy()) break; - Function *NewF = Intrinsic::getDeclaration( + Function *NewF = Intrinsic::getOrInsertDeclaration( II.getModule(), NewIID, {II.getType(), SrcLHS->getType()}); Value *Args[] = {SrcLHS, SrcRHS, ConstantInt::get(CC->getType(), SrcPred)}; @@ -1205,7 +1206,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { // If we can prove we don't have one of the special cases then we can use a // normal fma instead. if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) { - II.setCalledOperand(Intrinsic::getDeclaration( + II.setCalledOperand(Intrinsic::getOrInsertDeclaration( II.getModule(), Intrinsic::fma, II.getType())); return &II; } @@ -1401,7 +1402,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC, Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask); } - Function *NewIntrin = Intrinsic::getDeclaration( + Function *NewIntrin = Intrinsic::getOrInsertDeclaration( II.getModule(), II.getIntrinsicID(), OverloadTys); CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args); NewCall->takeName(&II); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 53628981e124..800bdbe04cf7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1555,8 +1555,8 @@ bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const { MIB.addImm(MFI->getLDSSize()); } else { Module *M = MF->getFunction().getParent(); - const GlobalValue *GV - = 
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize); + const GlobalValue *GV = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize); MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index e01c9dc66a3f..eb553ae4eb80 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -753,7 +753,7 @@ bool AMDGPULibCalls::fold(CallInst *CI) { CI->setArgOperand(1, SplatArg1); } - CI->setCalledFunction(Intrinsic::getDeclaration( + CI->setCalledFunction(Intrinsic::getOrInsertDeclaration( CI->getModule(), Intrinsic::ldexp, {CI->getType(), CI->getArgOperand(1)->getType()})); return true; @@ -1034,7 +1034,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, // pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31)) FunctionCallee ExpExpr; if (ShouldUseIntrinsic) - ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()}); + ExpExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::exp2, + {FPOp->getType()}); else { ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo)); if (!ExpExpr) @@ -1108,8 +1109,8 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, if (needlog) { FunctionCallee LogExpr; if (ShouldUseIntrinsic) { - LogExpr = - Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()}); + LogExpr = Intrinsic::getOrInsertDeclaration(M, Intrinsic::log2, + {FPOp->getType()}); } else { LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo)); if (!LogExpr) @@ -1298,8 +1299,8 @@ void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, } } - CI->setCalledFunction( - Intrinsic::getDeclaration(CI->getModule(), IntrID, {CI->getType()})); + CI->setCalledFunction(Intrinsic::getOrInsertDeclaration( + CI->getModule(), IntrID, {CI->getType()})); } bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic( 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 51a5b7702c00..ff5eb8149010 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -285,8 +285,8 @@ class AMDGPULowerModuleLDS { BasicBlock *Entry = &Func->getEntryBlock(); IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); - Function *Decl = - Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + Func->getParent(), Intrinsic::donothing, {}); Value *UseInstance[1] = { Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; @@ -529,8 +529,8 @@ public: // block to spare deduplicating it later. auto [It, Inserted] = tableKernelIndexCache.try_emplace(F); if (Inserted) { - Function *Decl = - Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::amdgcn_lds_kernel_id, {}); auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca(); IRBuilder<> Builder(&*InsertAt); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 24bfbff41ec5..63da3443479b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -973,10 +973,10 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F); if (!IsAMDHSA) { - Function *LocalSizeYFn = - Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y); - Function *LocalSizeZFn = - Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z); + Function *LocalSizeYFn = Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::r600_read_local_size_y); + Function *LocalSizeZFn = Intrinsic::getOrInsertDeclaration( + Mod, Intrinsic::r600_read_local_size_z); CallInst *LocalSizeY = 
Builder.CreateCall(LocalSizeYFn, {}); CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {}); @@ -1022,7 +1022,7 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) { // } hsa_kernel_dispatch_packet_t // Function *DispatchPtrFn = - Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr); CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {}); DispatchPtr->addRetAttr(Attribute::NoAlias); @@ -1082,7 +1082,7 @@ Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder, llvm_unreachable("invalid dimension"); } - Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID); + Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID); CallInst *CI = Builder.CreateCall(WorkitemIdFn); ST.makeLIDRangeMetadata(CI); F->removeFnAttr(AttrName); @@ -1564,7 +1564,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I, continue; case Intrinsic::objectsize: { Value *Src = Intr->getOperand(0); - Function *ObjectSize = Intrinsic::getDeclaration( + Function *ObjectSize = Intrinsic::getOrInsertDeclaration( Mod, Intrinsic::objectsize, {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index 4669bb45473c..cfce56f0bfe9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -336,8 +336,8 @@ static void markUsedByKernel(Function *Func, GlobalVariable *SGV) { BasicBlock *Entry = &Func->getEntryBlock(); IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt()); - Function *Decl = - Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(), + Intrinsic::donothing, {}); Value *UseInstance[1] = { Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)}; @@ -922,7 +922,8 @@ void 
AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, StringRef("__asan_free_impl"), FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false)); Value *ReturnAddr = IRB.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::returnaddress), IRB.getInt32(0)); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), + IRB.getInt32(0)); Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty); Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty); IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt}); @@ -1055,8 +1056,8 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses( SetVector LDSInstructions; getLDSMemoryInstructions(Func, LDSInstructions); - Function *Decl = - Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {}); + Function *Decl = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::amdgcn_lds_kernel_id, {}); auto *KernelId = IRB.CreateCall(Decl, {}); GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable; GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index d701bf037fdf..5d7ca89571b2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1112,8 +1112,8 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS)) return nullptr; Module *M = II->getModule(); - Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(), - {DestTy, SrcTy, DestTy}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return II; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp index 3758c768b867..59cc61e347bc 100644 --- 
a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp @@ -295,8 +295,8 @@ bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT, // Remove and delete the unreachable inst. UnreachableBlock->getTerminator()->eraseFromParent(); - Function *UnreachableIntrin = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable); + Function *UnreachableIntrin = Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::amdgcn_unreachable); // Insert a call to an intrinsic tracking that this is an unreachable // point, in case we want to kill the active lanes or something later. diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index edd881c84078..a7f2b66e3cd1 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -117,13 +117,15 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) { BoolUndef = PoisonValue::get(Boolean); IntMaskZero = ConstantInt::get(IntMask, 0); - If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if, { IntMask }); - Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else, - { IntMask, IntMask }); - IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break, - { IntMask }); - Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask }); - EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask }); + If = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_if, {IntMask}); + Else = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_else, + {IntMask, IntMask}); + IfBreak = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_if_break, + {IntMask}); + Loop = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_loop, {IntMask}); + EndCf = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_end_cf, + {IntMask}); } /// Is the branch condition 
uniform or did the StructurizeCFG pass diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index bf757edfa858..a35582bebb08 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21149,7 +21149,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get // here. if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) { - Function *MCR = Intrinsic::getDeclaration(M, Intrinsic::arm_mcr); + Function *MCR = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_mcr); Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(7), Builder.getInt32(10), Builder.getInt32(5)}; @@ -21160,7 +21160,7 @@ Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder, llvm_unreachable("makeDMB on a target so old that it has no barriers"); } } else { - Function *DMB = Intrinsic::getDeclaration(M, Intrinsic::arm_dmb); + Function *DMB = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_dmb); // Only a full system barrier exists in the M-class architectures. Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain; Constant *CDomain = Builder.getInt32(Domain); @@ -21417,7 +21417,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, if (ValueTy->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd; - Function *Ldrex = Intrinsic::getDeclaration(M, Int); + Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int); Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi"); @@ -21433,7 +21433,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Type *Tys[] = { Addr->getType() }; Intrinsic::ID Int = IsAcquire ? 
Intrinsic::arm_ldaex : Intrinsic::arm_ldrex; - Function *Ldrex = Intrinsic::getDeclaration(M, Int, Tys); + Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int, Tys); CallInst *CI = Builder.CreateCall(Ldrex, Addr); CI->addParamAttr( @@ -21446,7 +21446,8 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::arm_clrex)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_clrex)); } Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, @@ -21461,7 +21462,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, if (Val->getType()->getPrimitiveSizeInBits() == 64) { Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd; - Function *Strex = Intrinsic::getDeclaration(M, Int); + Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int); Type *Int32Ty = Type::getInt32Ty(M->getContext()); Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo"); @@ -21473,7 +21474,7 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, Intrinsic::ID Int = IsRelease ? 
Intrinsic::arm_stlex : Intrinsic::arm_strex; Type *Tys[] = { Addr->getType() }; - Function *Strex = Intrinsic::getDeclaration(M, Int, Tys); + Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int, Tys); CallInst *CI = Builder.CreateCall( Strex, {Builder.CreateZExtOrBitCast( @@ -21601,8 +21602,8 @@ bool ARMTargetLowering::lowerInterleavedLoad( static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2, Intrinsic::arm_neon_vld3, Intrinsic::arm_neon_vld4}; - Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys); + Function *VldnFunc = Intrinsic::getOrInsertDeclaration( + LI->getModule(), LoadInts[Factor - 2], Tys); SmallVector Ops; Ops.push_back(BaseAddr); @@ -21617,7 +21618,7 @@ bool ARMTargetLowering::lowerInterleavedLoad( Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace()); Type *Tys[] = {VecTy, PtrTy}; Function *VldnFunc = - Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys); + Intrinsic::getOrInsertDeclaration(LI->getModule(), LoadInts, Tys); SmallVector Ops; Ops.push_back(BaseAddr); @@ -21762,7 +21763,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {PtrTy, SubVecTy}; - Function *VstNFunc = Intrinsic::getDeclaration( + Function *VstNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), StoreInts[Factor - 2], Tys); SmallVector Ops; @@ -21778,7 +21779,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI, Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace()); Type *Tys[] = {PtrTy, SubVecTy}; Function *VstNFunc = - Intrinsic::getDeclaration(SI->getModule(), StoreInts, Tys); + Intrinsic::getOrInsertDeclaration(SI->getModule(), StoreInts, Tys); SmallVector Ops; Ops.push_back(BaseAddr); diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp index 861d60d3bcce..7804725ce773 100644 --- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp +++ 
b/llvm/lib/Target/ARM/ARMParallelDSP.cpp @@ -630,13 +630,14 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) { Value* Args[] = { WideLd0, WideLd1, Acc }; Function *SMLAD = nullptr; if (Exchange) - SMLAD = Acc->getType()->isIntegerTy(32) ? - Intrinsic::getDeclaration(M, Intrinsic::arm_smladx) : - Intrinsic::getDeclaration(M, Intrinsic::arm_smlaldx); + SMLAD = + Acc->getType()->isIntegerTy(32) + ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smladx) + : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlaldx); else - SMLAD = Acc->getType()->isIntegerTy(32) ? - Intrinsic::getDeclaration(M, Intrinsic::arm_smlad) : - Intrinsic::getDeclaration(M, Intrinsic::arm_smlald); + SMLAD = Acc->getType()->isIntegerTy(32) + ? Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlad) + : Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_smlald); IRBuilder Builder(InsertAfter->getParent(), BasicBlock::iterator(InsertAfter)); diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index e554e4d428d4..60211db8a61a 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -401,7 +401,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, case 8: VCTPID = Intrinsic::arm_mve_vctp16; break; case 16: VCTPID = Intrinsic::arm_mve_vctp8; break; } - Function *VCTP = Intrinsic::getDeclaration(M, VCTPID); + Function *VCTP = Intrinsic::getOrInsertDeclaration(M, VCTPID); Value *VCTPCall = Builder.CreateCall(VCTP, Processed); ActiveLaneMask->replaceAllUsesWith(VCTPCall); diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp index 4be6220b358b..7921518166f9 100644 --- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -103,7 +103,7 @@ uint32_t BPFCoreSharedInfo::SeqNum; Instruction *BPFCoreSharedInfo::insertPassThrough(Module *M, BasicBlock *BB, 
Instruction *Input, Instruction *Before) { - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::bpf_passthrough, {Input->getType(), Input->getType()}); Constant *SeqNumVal = ConstantInt::get(Type::getInt32Ty(BB->getContext()), BPFCoreSharedInfo::SeqNum++); diff --git a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp index 4ab0cbcc9247..4ca7bbe9c2a8 100644 --- a/llvm/lib/Target/BPF/BPFAdjustOpt.cpp +++ b/llvm/lib/Target/BPF/BPFAdjustOpt.cpp @@ -126,7 +126,7 @@ bool BPFAdjustOptImpl::adjustICmpToBuiltin() { Constant *Opcode = ConstantInt::get(Type::getInt32Ty(BB.getContext()), Op); - Function *Fn = Intrinsic::getDeclaration( + Function *Fn = Intrinsic::getOrInsertDeclaration( M, Intrinsic::bpf_compare, {Op0->getType(), ConstOp1->getType()}); auto *NewInst = CallInst::Create(Fn, {Opcode, Op0, ConstOp1}); NewInst->insertBefore(&I); diff --git a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp index 5d8339b4a44c..9f7e3414beb8 100644 --- a/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp +++ b/llvm/lib/Target/BPF/BPFPreserveStaticOffset.cpp @@ -163,7 +163,7 @@ static CallInst *makeIntrinsicCall(Module *M, ArrayRef Types, ArrayRef Args) { - Function *Fn = Intrinsic::getDeclaration(M, Intrinsic, Types); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, Intrinsic, Types); return CallInst::Create(Fn, Args); } diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp index c0f8d433833e..99df48508720 100644 --- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp +++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp @@ -134,8 +134,8 @@ public: /// piecemeal way - we can add the casts in to avoid updating all of the uses /// or defs, and by the end all of the casts will be redundant. 
Value *createTmpHandleCast(Value *V, Type *Ty) { - Function *CastFn = Intrinsic::getDeclaration(&M, Intrinsic::dx_cast_handle, - {Ty, V->getType()}); + Function *CastFn = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::dx_cast_handle, {Ty, V->getType()}); CallInst *Cast = OpBuilder.getIRB().CreateCall(CastFn, {V}); CleanupCasts.push_back(Cast); return Cast; diff --git a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index 3274f9162b54..65bbb1364488 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -212,7 +212,7 @@ bool HexagonGenExtract::convert(Instruction *In) { Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu : Intrinsic::hexagon_S2_extractup; Module *Mod = BB->getParent()->getParent(); - Function *ExtF = Intrinsic::getDeclaration(Mod, IntId); + Function *ExtF = Intrinsic::getOrInsertDeclaration(Mod, IntId); Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)}); if (SL != 0) NewIn = IRB.CreateShl(NewIn, SL, CSL->getName()); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 856c952e785d..03c12f5ce447 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -3865,7 +3865,7 @@ Value *HexagonTargetLowering::emitLoadLinked(IRBuilderBase &Builder, assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported"); Intrinsic::ID IntID = (SZ == 32) ? 
Intrinsic::hexagon_L2_loadw_locked : Intrinsic::hexagon_L4_loadd_locked; - Function *Fn = Intrinsic::getDeclaration(M, IntID); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID); Value *Call = Builder.CreateCall(Fn, Addr, "larx"); @@ -3886,7 +3886,7 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilderBase &Builder, assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported"); Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked : Intrinsic::hexagon_S4_stored_locked; - Function *Fn = Intrinsic::getDeclaration(M, IntID); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID); Val = Builder.CreateBitCast(Val, CastTy); diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 4ef009c87a1e..705e1f43851f 100644 --- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1532,7 +1532,8 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At, ParsedValues &PV) { IRBuilder<> B(&*At); Module *M = At->getParent()->getParent()->getParent(); - Function *PMF = Intrinsic::getDeclaration(M, Intrinsic::hexagon_M4_pmpyw); + Function *PMF = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::hexagon_M4_pmpyw); Value *P = PV.P, *Q = PV.Q, *P0 = P; unsigned IC = PV.IterCount; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index f4e495266eae..d2cfd3851e71 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -2390,8 +2390,8 @@ auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo, Type *Int64Ty = Type::getInt64Ty(F.getContext()); Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst"); Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst"); - Function *FI = Intrinsic::getDeclaration(F.getParent(), - 
Intrinsic::hexagon_S2_valignrb); + Function *FI = Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::hexagon_S2_valignrb); Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt}, "cup"); return Builder.CreateBitCast(Call, Lo->getType(), "cst"); } @@ -2587,12 +2587,13 @@ auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder, unsigned HwLen = HST.getVectorLength(); Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast : Intrinsic::hexagon_V6_pred_typecast_128B; - Function *FI = - Intrinsic::getDeclaration(F.getParent(), TC, {DestTy, Val->getType()}); + Function *FI = Intrinsic::getOrInsertDeclaration(F.getParent(), TC, + {DestTy, Val->getType()}); return Builder.CreateCall(FI, {Val}, "cup"); }; - Function *IntrFn = Intrinsic::getDeclaration(F.getParent(), IntID, ArgTys); + Function *IntrFn = + Intrinsic::getOrInsertDeclaration(F.getParent(), IntID, ArgTys); FunctionType *IntrTy = IntrFn->getFunctionType(); SmallVector IntrArgs; diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index bfafb3317521..8edca34624e9 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5808,7 +5808,7 @@ Value *LoongArchTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty()); Type *Tys[] = {AlignedAddr->getType()}; Function *MaskedCmpXchg = - Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); + Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys); Value *Result = Builder.CreateCall( MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, FailureOrdering}); Result = Builder.CreateTrunc(Result, Builder.getInt32Ty()); @@ -5838,7 +5838,7 @@ Value *LoongArchTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *Ordering = Builder.getIntN(GRLen, static_cast(AI->getOrdering())); Type *Tys[] = {AlignedAddr->getType()}; - Function *LlwOpScwLoop = 
Intrinsic::getDeclaration( + Function *LlwOpScwLoop = Intrinsic::getOrInsertDeclaration( AI->getModule(), getIntrinsicForMaskedAtomicRMWBinOp(GRLen, AI->getOperation()), Tys); diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 082546c4dd72..1e30e0113e43 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -282,7 +282,7 @@ static void convertToParamAS(Use *OldUse, Value *Param, bool HasCvtaParam, [](Value *Addr, Instruction *OriginalUser) -> Value * { PointerType *ReturnTy = PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_GENERIC); - Function *CvtToGen = Intrinsic::getDeclaration( + Function *CvtToGen = Intrinsic::getOrInsertDeclaration( OriginalUser->getModule(), Intrinsic::nvvm_ptr_param_to_gen, {ReturnTy, PointerType::get(OriginalUser->getContext(), ADDRESS_SPACE_PARAM)}); diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 9a8ea8f87896..b141229dcfc7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -360,7 +360,8 @@ static Instruction *simplifyNvvmIntrinsic(IntrinsicInst *II, InstCombiner &IC) { // type argument, equal to that of the nvvm intrinsic's argument. Type *Tys[] = {II->getArgOperand(0)->getType()}; return CallInst::Create( - Intrinsic::getDeclaration(II->getModule(), *Action.IID, Tys), Args); + Intrinsic::getOrInsertDeclaration(II->getModule(), *Action.IID, Tys), + Args); } // Simplify to target-generic binary op. 
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index d9847a21489e..911d92f0c484 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12181,7 +12181,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N, static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getDeclaration(M, Id); + Function *Func = Intrinsic::getOrInsertDeclaration(M, Id); return Builder.CreateCall(Func, {}); } @@ -12206,7 +12206,7 @@ Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. if (isa(Inst)) return Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( Builder.GetInsertBlock()->getParent()->getParent(), Intrinsic::ppc_cfence, {Inst->getType()}), {Inst}); @@ -19005,7 +19005,7 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic( Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Type *ValTy = Incr->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); - Function *RMW = Intrinsic::getDeclaration( + Function *RMW = Intrinsic::getOrInsertDeclaration( M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation())); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo"); @@ -19028,7 +19028,7 @@ Value *PPCTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( Type *ValTy = CmpVal->getType(); assert(ValTy->getPrimitiveSizeInBits() == 128); Function *IntCmpXchg = - Intrinsic::getDeclaration(M, Intrinsic::ppc_cmpxchg_i128); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::ppc_cmpxchg_i128); Type *Int64Ty = Type::getInt64Ty(M->getContext()); Value *CmpLo = Builder.CreateTrunc(CmpVal, Int64Ty, "cmp_lo"); Value *CmpHi = diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp 
b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp index d10fe11bb587..9c2b58a47392 100644 --- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp +++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp @@ -123,7 +123,7 @@ bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func, return false; CI->setCalledFunction( - Intrinsic::getDeclaration(&M, Intrinsic::pow, CI->getType())); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::pow, CI->getType())); return true; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 230ccd8209e1..1f9fc984515c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -20608,7 +20608,7 @@ Value *RISCVTargetLowering::emitMaskedAtomicRMWIntrinsic( Value *Ordering = Builder.getIntN(XLen, static_cast(AI->getOrdering())); Type *Tys[] = {AlignedAddr->getType()}; - Function *LrwOpScwLoop = Intrinsic::getDeclaration( + Function *LrwOpScwLoop = Intrinsic::getOrInsertDeclaration( AI->getModule(), getIntrinsicForMaskedAtomicRMWBinOp(XLen, AI->getOperation()), Tys); @@ -20672,7 +20672,7 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic( } Type *Tys[] = {AlignedAddr->getType()}; Function *MaskedCmpXchg = - Intrinsic::getDeclaration(CI->getModule(), CmpXchgIntrID, Tys); + Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys); Value *Result = Builder.CreateCall( MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering}); if (XLen == 64) @@ -21170,7 +21170,7 @@ bool RISCVTargetLowering::preferScalarizeSplat(SDNode *N) const { static Value *useTpOffset(IRBuilderBase &IRB, unsigned Offset) { Module *M = IRB.GetInsertBlock()->getModule(); Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), Offset); } @@ 
-21287,9 +21287,9 @@ bool RISCVTargetLowering::lowerInterleavedLoad( auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); - Function *VlsegNFunc = - Intrinsic::getDeclaration(LI->getModule(), FixedVlsegIntrIds[Factor - 2], - {VTy, LI->getPointerOperandType(), XLenTy}); + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( + LI->getModule(), FixedVlsegIntrIds[Factor - 2], + {VTy, LI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements()); @@ -21341,9 +21341,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI, auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); - Function *VssegNFunc = - Intrinsic::getDeclaration(SI->getModule(), FixedVssegIntrIds[Factor - 2], - {VTy, SI->getPointerOperandType(), XLenTy}); + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( + SI->getModule(), FixedVssegIntrIds[Factor - 2], + {VTy, SI->getPointerOperandType(), XLenTy}); auto Mask = SVI->getShuffleMask(); SmallVector Ops; @@ -21388,7 +21388,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(ResVTy)) { - Function *VlsegNFunc = Intrinsic::getDeclaration( + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), FixedVlsegIntrIds[Factor - 2], {ResVTy, LI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); @@ -21408,7 +21408,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( NumElts * SEW / 8), Factor); - Function *VlsegNFunc = Intrinsic::getDeclaration( + Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), IntrIds[Factor - 2], {VecTupTy, XLenTy}); Value *VL = Constant::getAllOnesValue(XLenTy); @@ -21418,7 +21418,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad( SmallVector AggrTypes{Factor, ResVTy}; Return = PoisonValue::get(StructType::get(LI->getContext(), 
AggrTypes)); - Function *VecExtractFunc = Intrinsic::getDeclaration( + Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration( LI->getModule(), Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy}); for (unsigned i = 0; i < Factor; ++i) { Value *VecExtract = @@ -21454,7 +21454,7 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen()); if (auto *FVTy = dyn_cast(InVTy)) { - Function *VssegNFunc = Intrinsic::getDeclaration( + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, SI->getPointerOperandType(), XLenTy}); Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements()); @@ -21475,12 +21475,12 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore( NumElts * SEW / 8), Factor); - Function *VssegNFunc = Intrinsic::getDeclaration( + Function *VssegNFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), IntrIds[Factor - 2], {VecTupTy, XLenTy}); Value *VL = Constant::getAllOnesValue(XLenTy); - Function *VecInsertFunc = Intrinsic::getDeclaration( + Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration( SI->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy}); Value *StoredVal = PoisonValue::get(VecTupTy); for (unsigned i = 0; i < Factor; ++i) diff --git a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp index 1872b238d107..ecf9b6ddae1f 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPrepareFunctions.cpp @@ -353,11 +353,11 @@ static void lowerExpectAssume(IntrinsicInst *II) { // We need to lower this into a builtin and then the builtin into a SPIR-V // instruction. 
if (II->getIntrinsicID() == Intrinsic::assume) { - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::SPVIntrinsics::spv_assume); II->setCalledFunction(F); } else if (II->getIntrinsicID() == Intrinsic::expect) { - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::SPVIntrinsics::spv_expect, {II->getOperand(0)->getType()}); II->setCalledFunction(F); @@ -372,12 +372,12 @@ static bool toSpvOverloadedIntrinsic(IntrinsicInst *II, Intrinsic::ID NewID, ArrayRef OpNos) { Function *F = nullptr; if (OpNos.empty()) { - F = Intrinsic::getDeclaration(II->getModule(), NewID); + F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID); } else { SmallVector Tys; for (unsigned OpNo : OpNos) Tys.push_back(II->getOperand(OpNo)->getType()); - F = Intrinsic::getDeclaration(II->getModule(), NewID, Tys); + F = Intrinsic::getOrInsertDeclaration(II->getModule(), NewID, Tys); } II->setCalledFunction(F); return true; diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp index f62afb8ddfcf..345327e880ec 100644 --- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp @@ -366,8 +366,8 @@ bool SystemZTDCPass::runOnFunction(Function &F) { if (!Worthy) continue; // Call the intrinsic, compare result with 0. 
- Function *TDCFunc = - Intrinsic::getDeclaration(&M, Intrinsic::s390_tdc, V->getType()); + Function *TDCFunc = Intrinsic::getOrInsertDeclaration( + &M, Intrinsic::s390_tdc, V->getType()); IRBuilder<> IRB(I); Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask); Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal}); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp index c040e560be60..b999f83507f4 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp @@ -1016,7 +1016,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) { // wasm.catch() will be lowered down to wasm 'catch' instruction in // instruction selection. - CatchF = Intrinsic::getDeclaration(&M, Intrinsic::wasm_catch); + CatchF = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::wasm_catch); // Type for struct __WasmLongjmpArgs LongjmpArgsTy = StructType::get(Int8PtrTy, // env Int32Ty // val diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp index 2594430d1d8f..c61aa5eff4a7 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerRefTypesIntPtrConv.cpp @@ -72,7 +72,7 @@ bool WebAssemblyLowerRefTypesIntPtrConv::runOnFunction(Function &F) { I->replaceAllUsesWith(U); Function *TrapIntrin = - Intrinsic::getDeclaration(F.getParent(), Intrinsic::debugtrap); + Intrinsic::getOrInsertDeclaration(F.getParent(), Intrinsic::debugtrap); CallInst::Create(TrapIntrin, {}, "", I->getIterator()); worklist.insert(&*I); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7a6d20c6a121..de88db222797 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ 
-31163,12 +31163,14 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) { auto *C = cast(I->getOperand(I->getOperand(0) == AI ? 1 : 0)); - BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType()); + BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_C, + AI->getType()); unsigned Imm = llvm::countr_zero(C->getZExtValue()); Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); } else { - BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType()); + BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_I, + AI->getType()); assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit); @@ -31328,7 +31330,7 @@ void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic( break; } Function *CmpArith = - Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType()); + Intrinsic::getOrInsertDeclaration(AI->getModule(), IID, AI->getType()); Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), PointerType::getUnqual(Ctx)); Value *Call = Builder.CreateCall( @@ -31444,7 +31446,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { return nullptr; Function *MFence = - llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); + llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse2_mfence); Builder.CreateCall(MFence, {}); // Finally we can emit the atomic load. 
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp index 77139f38c977..c4374984da4b 100644 --- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp +++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp @@ -1876,7 +1876,8 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0, if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) { Value *Args[] = {Op0, CILength, CIIndex}; Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_extrqi); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_extrqi); return Builder.CreateCall(F, Args); } } @@ -1975,7 +1976,8 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1, Value *Args[] = {Op0, Op1, CILength, CIIndex}; Module *M = II.getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::x86_sse4a_insertqi); + Function *F = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_insertqi); return Builder.CreateCall(F, Args); } diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp index 5bbfabcbd67b..e88702caa9a5 100644 --- a/llvm/lib/Target/X86/X86PartialReduction.cpp +++ b/llvm/lib/Target/X86/X86PartialReduction.cpp @@ -278,7 +278,7 @@ bool X86PartialReduction::trySADReplacement(Instruction *Op) { IntrinsicNumElts = 16; } - Function *PSADBWFn = Intrinsic::getDeclaration(Op->getModule(), IID); + Function *PSADBWFn = Intrinsic::getOrInsertDeclaration(Op->getModule(), IID); if (NumElts < 16) { // Pad input with zeroes. 
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 963d613ddbfe..05fc6f13129f 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -334,7 +334,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { if (UseStackGuard) { Value *Val = Builder.CreateLoad(Int32Ty, Cookie); Value *FrameAddr = Builder.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( TheModule, Intrinsic::frameaddress, Builder.getPtrTy( TheModule->getDataLayout().getAllocaAddrSpace())), @@ -370,7 +370,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { return Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_lsda), F); + Intrinsic::getOrInsertDeclaration(TheModule, Intrinsic::x86_seh_lsda), F); } /// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls @@ -624,17 +624,17 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { // that it can recover the original frame pointer. IRBuilder<> Builder(RegNode->getNextNode()); Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getPtrTy()); - Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehregnode), - {RegNodeI8}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + TheModule, Intrinsic::x86_seh_ehregnode), + {RegNodeI8}); if (EHGuardNode) { IRBuilder<> Builder(EHGuardNode->getNextNode()); Value *EHGuardNodeI8 = Builder.CreateBitCast(EHGuardNode, Builder.getPtrTy()); - Builder.CreateCall( - Intrinsic::getDeclaration(TheModule, Intrinsic::x86_seh_ehguard), - {EHGuardNodeI8}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + TheModule, Intrinsic::x86_seh_ehguard), + {EHGuardNodeI8}); } // Calculate state numbers. 
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp index 95962d1a0a24..3604774ddf35 100644 --- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp +++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp @@ -157,8 +157,8 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) { for (User *U : Users) { Instruction *Inst = cast(U); IRBuilder<> Builder(Inst); - Function *GetID = Intrinsic::getDeclaration(GV->getParent(), - Intrinsic::xcore_getid); + Function *GetID = Intrinsic::getOrInsertDeclaration(GV->getParent(), + Intrinsic::xcore_getid); Value *ThreadID = Builder.CreateCall(GetID, {}); Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV, {Builder.getInt64(0), ThreadID}); diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index 9943c3cbb9fc..898d55fab2b0 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -172,7 +172,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) { // %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ] // --> // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt) - Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Phi.getModule(), IID, Phi.getType()); Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt})); return true; } @@ -331,7 +332,7 @@ static bool tryToRecognizePopCount(Instruction &I) { m_SpecificInt(Mask55)))) { LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n"); IRBuilder<> Builder(&I); - Function *Func = Intrinsic::getDeclaration( + Function *Func = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::ctpop, I.getType()); I.replaceAllUsesWith(Builder.CreateCall(Func, {Root})); ++NumPopCountRecognized; @@ -398,8 
+399,8 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) { return false; IRBuilder<> Builder(&I); - Function *Fn = Intrinsic::getDeclaration(I.getModule(), Intrinsic::fptosi_sat, - {SatTy, FpTy}); + Function *Fn = Intrinsic::getOrInsertDeclaration( + I.getModule(), Intrinsic::fptosi_sat, {SatTy, FpTy}); Value *Sat = Builder.CreateCall(Fn, In); I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy)); return true; @@ -431,7 +432,7 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, IRBuilderBase::FastMathFlagGuard Guard(Builder); Builder.setFastMathFlags(Call->getFastMathFlags()); - Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, Ty); + Function *Sqrt = Intrinsic::getOrInsertDeclaration(M, Intrinsic::sqrt, Ty); Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt"); Call->replaceAllUsesWith(NewSqrt); diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp index 1c45bcd7f6a8..45b9767657c6 100644 --- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp +++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp @@ -52,7 +52,8 @@ coro::LowererBase::LowererBase(Module &M) CallInst *coro::LowererBase::makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt) { auto *IndexVal = ConstantInt::get(Type::getInt8Ty(Context), Index); - auto *Fn = Intrinsic::getDeclaration(&TheModule, Intrinsic::coro_subfn_addr); + auto *Fn = + Intrinsic::getOrInsertDeclaration(&TheModule, Intrinsic::coro_subfn_addr); assert(Index >= CoroSubFnInst::IndexFirst && Index < CoroSubFnInst::IndexLast && @@ -183,7 +184,7 @@ void coro::suppressCoroAllocs(LLVMContext &Context, static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin, CoroSuspendInst *SuspendInst) { Module *M = SuspendInst->getModule(); - auto *Fn = Intrinsic::getDeclaration(M, Intrinsic::coro_save); + auto *Fn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::coro_save); auto *SaveInst = cast( CallInst::Create(Fn, CoroBegin, "", 
SuspendInst->getIterator())); assert(!SuspendInst->getCoroSave()); diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp index 91d445dfc4c7..9e5d9ea31af6 100644 --- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp +++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp @@ -125,7 +125,8 @@ void CrossDSOCFI::buildCFICheck(Module &M) { ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId); BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F); IRBuilder<> IRBTest(TestBB); - Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test); + Function *BitsetTestFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); Value *Test = IRBTest.CreateCall( BitsetTestFn, {&Addr, MetadataAsValue::get( diff --git a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp index d84856f71c9d..543987d5981b 100644 --- a/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfileProbe.cpp @@ -401,7 +401,7 @@ void SampleProfileProber::instrumentOneFunc(Function &F, TargetMachine *TM) { assert(Builder.GetInsertPoint() != BB->end() && "Cannot get the probing point"); Function *ProbeFn = - llvm::Intrinsic::getDeclaration(M, Intrinsic::pseudoprobe); + llvm::Intrinsic::getOrInsertDeclaration(M, Intrinsic::pseudoprobe); Value *Args[] = {Builder.getInt64(Guid), Builder.getInt64(Index), Builder.getInt32(0), Builder.getInt64(PseudoProbeFullDistributionFactor)}; diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 36a1841b3634..59f986b4ca26 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -856,7 +856,7 @@ void llvm::updatePublicTypeTestCalls(Module &M, return; if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) { Function *TypeTestFunc = - Intrinsic::getDeclaration(&M, Intrinsic::type_test); + 
Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); for (Use &U : make_early_inc_range(PublicTypeTestFunc->uses())) { auto *CI = cast(U.getUser()); auto *NewCI = CallInst::Create( @@ -1187,7 +1187,8 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo, Instruction *ThenTerm = SplitBlockAndInsertIfThen(Cond, &CB, /*Unreachable=*/false); Builder.SetInsertPoint(ThenTerm); - Function *TrapFn = Intrinsic::getDeclaration(&M, Intrinsic::debugtrap); + Function *TrapFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap); auto *CallTrap = Builder.CreateCall(TrapFn); CallTrap->setDebugLoc(CB.getDebugLoc()); } @@ -1434,8 +1435,8 @@ void DevirtModule::tryICallBranchFunnel( } BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr); - Function *Intr = - Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {}); + Function *Intr = Intrinsic::getOrInsertDeclaration( + &M, llvm::Intrinsic::icall_branch_funnel, {}); auto *CI = CallInst::Create(Intr, JTArgs, "", BB); CI->setTailCallKind(CallInst::TCK_MustTail); @@ -2026,7 +2027,8 @@ void DevirtModule::scanTypeTestUsers( } void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) { - Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test); + Function *TypeTestFunc = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test); for (Use &U : llvm::make_early_inc_range(TypeCheckedLoadFunc->uses())) { auto *CI = dyn_cast(U.getUser()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp index e5c3a20e1a64..21588aca5127 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -1232,7 +1232,8 @@ static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) { assert(I.getOpcode() == Instruction::Add && "Expecting add instruction"); Type *Ty = I.getType(); auto getUAddSat = [&]() { - return 
Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty); + return Intrinsic::getOrInsertDeclaration(I.getModule(), Intrinsic::uadd_sat, + Ty); }; // add (umin X, ~Y), Y --> uaddsat X, Y @@ -2127,7 +2128,7 @@ static Instruction *foldSubOfMinMax(BinaryOperator &I, if (match(Op0, m_c_Add(m_Specific(X), m_Specific(Y))) && (Op0->hasOneUse() || Op1->hasOneUse())) { Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); - Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(I.getModule(), InvID, Ty); return CallInst::Create(F, {X, Y}); } @@ -2150,7 +2151,7 @@ static Instruction *foldSubOfMinMax(BinaryOperator &I, if (MinMax->isSigned() && match(Y, m_ZeroInt()) && match(X, m_NSWSub(m_Specific(Op0), m_Value(Z)))) { Intrinsic::ID InvID = getInverseMinMaxIntrinsic(MinMax->getIntrinsicID()); - Function *F = Intrinsic::getDeclaration(I.getModule(), InvID, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(I.getModule(), InvID, Ty); return CallInst::Create(F, {Op0, Z}); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 964616a4eb35..453071f3f982 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2269,7 +2269,8 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I, Builder.CreateBinOp(I.getOpcode(), X->getOperand(0), Y->getOperand(0)); Value *NewOp1 = Builder.CreateBinOp(I.getOpcode(), X->getOperand(1), Y->getOperand(1)); - Function *F = Intrinsic::getDeclaration(I.getModule(), IID, I.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(I.getModule(), IID, I.getType()); return CallInst::Create(F, {NewOp0, NewOp1, X->getOperand(2)}); } case Intrinsic::bswap: @@ -2280,7 +2281,8 @@ foldBitwiseLogicWithIntrinsics(BinaryOperator &I, : ConstantInt::get(I.getType(), IID == Intrinsic::bswap ? 
RHSC->byteSwap() : RHSC->reverseBits())); - Function *F = Intrinsic::getDeclaration(I.getModule(), IID, I.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(I.getModule(), IID, I.getType()); return CallInst::Create(F, {NewOp0}); } default: @@ -3056,7 +3058,8 @@ InstCombinerImpl::convertOrOfShiftsToFunnelShift(Instruction &Or) { static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) { if (auto Opt = IC.convertOrOfShiftsToFunnelShift(Or)) { auto [IID, FShiftArgs] = *Opt; - Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Or.getModule(), IID, Or.getType()); return CallInst::Create(F, FShiftArgs); } @@ -3095,7 +3098,7 @@ static Instruction *matchOrConcat(Instruction &Or, Value *NewUpper = Builder.CreateZExt(Hi, Ty); NewUpper = Builder.CreateShl(NewUpper, HalfWidth); Value *BinOp = Builder.CreateOr(NewLower, NewUpper); - Function *F = Intrinsic::getDeclaration(Or.getModule(), id, Ty); + Function *F = Intrinsic::getOrInsertDeclaration(Or.getModule(), id, Ty); return Builder.CreateCall(F, BinOp); }; @@ -4803,7 +4806,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) { match(II->getArgOperand(1), m_One()) && isKnownToBeAPowerOfTwo(II->getArgOperand(0), /*OrZero */ true)) { IID = (IID == Intrinsic::ctlz) ? 
Intrinsic::cttz : Intrinsic::ctlz; - Function *F = Intrinsic::getDeclaration(II->getModule(), IID, Ty); + Function *F = + Intrinsic::getOrInsertDeclaration(II->getModule(), IID, Ty); return CallInst::Create(F, {II->getArgOperand(0), Builder.getTrue()}); } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index f7a940679180..51e09b7e7c14 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -488,7 +488,8 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) { // cttz(bitreverse(x)) -> ctlz(x) if (match(Op0, m_BitReverse(m_Value(X)))) { Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz; - Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(II.getModule(), ID, II.getType()); return CallInst::Create(F, {X, II.getArgOperand(1)}); } @@ -647,7 +648,7 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { if (Op0->hasOneUse() && match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) { Function *F = - Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + Intrinsic::getOrInsertDeclaration(II.getModule(), Intrinsic::cttz, Ty); auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()}); auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth)); return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); @@ -657,7 +658,7 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) { if (match(Op0, m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) { Function *F = - Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + Intrinsic::getOrInsertDeclaration(II.getModule(), Intrinsic::cttz, Ty); return CallInst::Create(F, {X, IC.Builder.getFalse()}); } @@ -1181,7 +1182,8 @@ Instruction *InstCombinerImpl::matchSAddSubSat(IntrinsicInst &MinMax1) { 
return nullptr; // Finally create and return the sat intrinsic, truncated to the new type - Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy); + Function *F = Intrinsic::getOrInsertDeclaration(MinMax1.getModule(), + IntrinsicID, NewTy); Value *AT = Builder.CreateTrunc(AddSub->getOperand(0), NewTy); Value *BT = Builder.CreateTrunc(AddSub->getOperand(1), NewTy); Value *Sat = Builder.CreateCall(F, {AT, BT}); @@ -1286,8 +1288,8 @@ reassociateMinMaxWithConstantInOperand(IntrinsicInst *II, return nullptr; // max (max X, C), Y --> max (max X, Y), C - Function *MinMax = - Intrinsic::getDeclaration(II->getModule(), MinMaxID, II->getType()); + Function *MinMax = Intrinsic::getOrInsertDeclaration(II->getModule(), + MinMaxID, II->getType()); Value *NewInner = Builder.CreateBinaryIntrinsic(MinMaxID, X, Y); NewInner->takeName(Inner); return CallInst::Create(MinMax, {NewInner, C}); @@ -1346,7 +1348,8 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) { return nullptr; Module *Mod = II->getModule(); - Function *MinMax = Intrinsic::getDeclaration(Mod, MinMaxID, II->getType()); + Function *MinMax = + Intrinsic::getOrInsertDeclaration(Mod, MinMaxID, II->getType()); return CallInst::Create(MinMax, { MinMaxOp, ThirdOp }); } @@ -1571,7 +1574,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Type *Tys[3] = { CI.getArgOperand(0)->getType(), CI.getArgOperand(1)->getType(), CI.getArgOperand(2)->getType() }; - CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys)); + CI.setCalledFunction( + Intrinsic::getOrInsertDeclaration(M, MemCpyID, Tys)); Changed = true; } } @@ -2095,7 +2099,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC); Module *Mod = II->getModule(); - Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty); + Function *Fshl = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::fshl, Ty); return CallInst::Create(Fshl, { Op0, 
Op1, LeftShiftC }); } assert(IID == Intrinsic::fshl && @@ -2115,7 +2120,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form) if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) { Module *Mod = II->getModule(); - Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty); + Function *Bswap = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::bswap, Ty); return CallInst::Create(Bswap, { Op0 }); } if (Instruction *BitOp = @@ -2824,7 +2830,8 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { CallArgs.push_back(II->getArgOperand(4)); } - Function *NewFn = Intrinsic::getDeclaration(II->getModule(), NewIntrin); + Function *NewFn = + Intrinsic::getOrInsertDeclaration(II->getModule(), NewIntrin); return CallInst::Create(NewFn, CallArgs); } case Intrinsic::arm_neon_vtbl1: diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 9934c065ebf8..6c2554ea73b7 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -95,8 +95,8 @@ Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty, default: llvm_unreachable("Unsupported call!"); case Intrinsic::vscale: { - Function *Fn = - Intrinsic::getDeclaration(I->getModule(), Intrinsic::vscale, {Ty}); + Function *Fn = Intrinsic::getOrInsertDeclaration( + I->getModule(), Intrinsic::vscale, {Ty}); Res = CallInst::Create(Fn->getFunctionType(), Fn); break; } @@ -600,7 +600,8 @@ Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) { if (ShVal0 != ShVal1) Y = Builder.CreateTrunc(ShVal1, DestTy); Intrinsic::ID IID = IsFshl ? 
Intrinsic::fshl : Intrinsic::fshr; - Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy); + Function *F = + Intrinsic::getOrInsertDeclaration(Trunc.getModule(), IID, DestTy); return CallInst::Create(F, {X, Y, NarrowShAmt}); } @@ -1912,8 +1913,8 @@ Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) { // Do unary FP operation on smaller type. // (fptrunc (fabs x)) -> (fabs (fptrunc x)) Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty); - Function *Overload = Intrinsic::getDeclaration(FPT.getModule(), - II->getIntrinsicID(), Ty); + Function *Overload = Intrinsic::getOrInsertDeclaration( + FPT.getModule(), II->getIntrinsicID(), Ty); SmallVector OpBundles; II->getOperandBundlesAsDefs(OpBundles); CallInst *NewCI = @@ -2855,8 +2856,8 @@ Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) { if (IntrinsicNum != 0) { assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask"); assert(match(ShufOp1, m_Undef()) && "Unexpected shuffle op"); - Function *BswapOrBitreverse = - Intrinsic::getDeclaration(CI.getModule(), IntrinsicNum, DestTy); + Function *BswapOrBitreverse = Intrinsic::getOrInsertDeclaration( + CI.getModule(), IntrinsicNum, DestTy); Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy); return CallInst::Create(BswapOrBitreverse, {ScalarX}); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index d1eb84b5ca5c..7129499e0f8f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1125,7 +1125,7 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B, // use the sadd_with_overflow intrinsic to efficiently compute both the // result and the overflow bit. 
Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth); - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::sadd_with_overflow, NewType); InstCombiner::BuilderTy &Builder = IC.Builder; @@ -4790,11 +4790,11 @@ Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) { if (MulHadOtherUses) Builder.SetInsertPoint(Mul); - Function *F = Intrinsic::getDeclaration(I.getModule(), - Div->getOpcode() == Instruction::UDiv - ? Intrinsic::umul_with_overflow - : Intrinsic::smul_with_overflow, - X->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + I.getModule(), + Div->getOpcode() == Instruction::UDiv ? Intrinsic::umul_with_overflow + : Intrinsic::smul_with_overflow, + X->getType()); CallInst *Call = Builder.CreateCall(F, {X, Y}, "mul"); // If the multiplication was used elsewhere, to ensure that we don't leave @@ -6334,7 +6334,7 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal, MulA = Builder.CreateZExt(A, MulType); if (WidthB < MulWidth) MulB = Builder.CreateZExt(B, MulType); - Function *F = Intrinsic::getDeclaration( + Function *F = Intrinsic::getOrInsertDeclaration( I.getModule(), Intrinsic::umul_with_overflow, MulType); CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul"); IC.addToWorklist(MulInstr); @@ -7121,8 +7121,8 @@ static Instruction *foldVectorCmp(CmpInst &Cmp, if (auto *I = dyn_cast(V)) I->copyIRFlags(&Cmp); Module *M = Cmp.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 3f780285efe4..358563a5fcd5 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp 
@@ -1148,8 +1148,8 @@ static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal, if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X))))) return nullptr; - Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz, - II->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + II->getModule(), Intrinsic::cttz, II->getType()); return CallInst::Create(F, {X, II->getArgOperand(1)}); } @@ -2242,8 +2242,8 @@ foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) { else return nullptr; - Function *F = - Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType()); + Function *F = Intrinsic::getOrInsertDeclaration(SI.getModule(), + NewIntrinsicID, SI.getType()); return CallInst::Create(F, {X, Y}); } @@ -2537,7 +2537,8 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel, // This is a funnel/rotate that avoids shift-by-bitwidth UB in a suboptimal way. // Convert to funnel shift intrinsic. Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr; - Function *F = Intrinsic::getDeclaration(Sel.getModule(), IID, Sel.getType()); + Function *F = + Intrinsic::getOrInsertDeclaration(Sel.getModule(), IID, Sel.getType()); ShAmt = Builder.CreateZExt(ShAmt, Sel.getType()); return CallInst::Create(F, { SV0, SV1, ShAmt }); } @@ -2580,8 +2581,8 @@ static Instruction *foldSelectToCopysign(SelectInst &Sel, // Canonicalize the magnitude argument as the positive constant since we do // not care about its sign. 
Value *MagArg = ConstantFP::get(SelType, abs(*TC)); - Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign, - Sel.getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + Sel.getModule(), Intrinsic::copysign, Sel.getType()); return CallInst::Create(F, { MagArg, X }); } @@ -2600,8 +2601,8 @@ Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) { if (auto *I = dyn_cast(V)) I->copyIRFlags(&Sel); Module *M = Sel.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 3d4461dc1a87..8ca705ae1d36 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -898,7 +898,7 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Instruction *I, Value *X; if (DemandedMask == 1 && VTy->getScalarSizeInBits() % 2 == 0 && match(II->getArgOperand(0), m_Not(m_Value(X)))) { - Function *Ctpop = Intrinsic::getDeclaration( + Function *Ctpop = Intrinsic::getOrInsertDeclaration( II->getModule(), Intrinsic::ctpop, VTy); return InsertNewInstWith(CallInst::Create(Ctpop, {X}), I->getIterator()); } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index d9b4faff4c00..d68ae64f08aa 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -2474,8 +2474,8 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf, if (IsFNeg) return UnaryOperator::CreateFNegFMF(NewShuf, S0); - Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(), - Intrinsic::fabs, Shuf.getType()); + Function 
*FAbs = Intrinsic::getOrInsertDeclaration( + Shuf.getModule(), Intrinsic::fabs, Shuf.getType()); CallInst *NewF = CallInst::Create(FAbs, {NewShuf}); NewF->setFastMathFlags(S0->getFastMathFlags()); return NewF; @@ -2495,8 +2495,8 @@ static Instruction *foldShuffleOfUnaryOps(ShuffleVectorInst &Shuf, if (IsFNeg) { NewF = UnaryOperator::CreateFNeg(NewShuf); } else { - Function *FAbs = Intrinsic::getDeclaration(Shuf.getModule(), - Intrinsic::fabs, Shuf.getType()); + Function *FAbs = Intrinsic::getOrInsertDeclaration( + Shuf.getModule(), Intrinsic::fabs, Shuf.getType()); NewF = CallInst::Create(FAbs, {NewShuf}); } NewF->copyIRFlags(S0); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 1f4a6f793404..954c4cf19c20 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2084,8 +2084,8 @@ Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) { if (auto *BO = dyn_cast(V)) BO->copyIRFlags(&Inst); Module *M = Inst.getModule(); - Function *F = - Intrinsic::getDeclaration(M, Intrinsic::vector_reverse, V->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::vector_reverse, V->getType()); return CallInst::Create(F, V); }; @@ -3355,7 +3355,7 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) { if (InvokeInst *II = dyn_cast(&MI)) { // Replace invoke with a NOP intrinsic to maintain the original CFG Module *M = II->getModule(); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); + Function *F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), {}, "", II->getParent()); } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 2ad89b5ba753..02d9fab309d8 100644 --- 
a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1109,7 +1109,7 @@ struct FunctionStackPoisoner : public InstVisitor { // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for // this purpose. if (!isa(InstBefore)) { - Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration( + Function *DynamicAreaOffsetFunc = Intrinsic::getOrInsertDeclaration( InstBefore->getModule(), Intrinsic::get_dynamic_area_offset, {IntptrTy}); @@ -1867,7 +1867,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex); Module *M = IRB.GetInsertBlock()->getParent()->getParent(); IRB.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::asan_check_memaccess), + Intrinsic::getOrInsertDeclaration(M, Intrinsic::asan_check_memaccess), {IRB.CreatePointerCast(Addr, PtrTy), ConstantInt::get(Int32Ty, AccessInfo.Packed)}); return; diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp index 618b6fe1aea4..63d580d2b9d5 100644 --- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -194,7 +194,7 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI, IRB.SetInsertPoint(TrapBB); Intrinsic::ID IntrID = DebugTrapBB ? 
Intrinsic::ubsantrap : Intrinsic::trap; - auto *F = Intrinsic::getDeclaration(Fn->getParent(), IntrID); + auto *F = Intrinsic::getOrInsertDeclaration(Fn->getParent(), IntrID); CallInst *TrapCall; if (DebugTrapBB) { diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index cc7f20cffea7..5ec4973ea03d 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1042,14 +1042,14 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite, if (UseFixedShadowIntrinsic) { IRB.CreateCall( - Intrinsic::getDeclaration( + Intrinsic::getOrInsertDeclaration( M, UseShortGranules ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow : Intrinsic::hwasan_check_memaccess_fixedshadow), {Ptr, ConstantInt::get(Int32Ty, AccessInfo), ConstantInt::get(Int64Ty, Mapping.offset())}); } else { - IRB.CreateCall(Intrinsic::getDeclaration( + IRB.CreateCall(Intrinsic::getOrInsertDeclaration( M, UseShortGranules ? 
Intrinsic::hwasan_check_memaccess_shortgranules : Intrinsic::hwasan_check_memaccess), diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp index 28dc1c02b661..bbe0f4c61781 100644 --- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp +++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp @@ -110,7 +110,8 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) { Instruction *ThenTerm = SplitBlockAndInsertIfThen(Test, Call, false, VeryUnlikelyWeights); Builder.SetInsertPoint(ThenTerm); - Builder.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::debugtrap)); + Builder.CreateCall( + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap)); ++NumKCFIChecks; } diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 07d667434e07..19ec97c17f31 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2853,7 +2853,7 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)), S2->getType()); Value *V2 = I.getOperand(2); - Function *Intrin = Intrinsic::getDeclaration( + Function *Intrin = Intrinsic::getOrInsertDeclaration( I.getModule(), I.getIntrinsicID(), S2Conv->getType()); Value *Shift = IRB.CreateCall(Intrin, {S0, S1, V2}); setShadow(&I, IRB.CreateOr(Shift, S2Conv)); @@ -3057,7 +3057,7 @@ struct MemorySanitizerVisitor : public InstVisitor { IRBuilder<> IRB(&I); Value *Op = I.getArgOperand(0); Type *OpType = Op->getType(); - Function *BswapFunc = Intrinsic::getDeclaration( + Function *BswapFunc = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::bswap, ArrayRef(&OpType, 1)); setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op))); setOrigin(&I, getOrigin(Op)); @@ -3287,7 +3287,7 @@ struct MemorySanitizerVisitor : public InstVisitor { S2_ext = IRB.CreateBitCast(S2_ext, 
getMMXVectorTy(64)); } - Function *ShadowFn = Intrinsic::getDeclaration( + Function *ShadowFn = Intrinsic::getOrInsertDeclaration( F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID())); Value *S = diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 10442fa0bb90..e6e474ed3760 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -919,7 +919,7 @@ void FunctionInstrumenter::instrument() { // llvm.instrprof.cover(i8* , i64 , i32 , // i32 ) Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_cover), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_cover), {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -931,7 +931,7 @@ void FunctionInstrumenter::instrument() { if (IsCtxProf) { auto *CSIntrinsic = - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_callsite); + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_callsite); // We want to count the instrumentable callsites, then instrument them. This // is because the llvm.instrprof.callsite intrinsic has an argument (like // the other instrprof intrinsics) capturing the total number of @@ -972,7 +972,7 @@ void FunctionInstrumenter::instrument() { // llvm.instrprof.timestamp(i8* , i64 , i32 , // i32 ) Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_timestamp), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_timestamp), {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), Builder.getInt32(I)}); I += PGOBlockCoverage ? 8 : 1; @@ -984,12 +984,12 @@ void FunctionInstrumenter::instrument() { "Cannot get the Instrumentation point"); // llvm.instrprof.increment(i8* , i64 , i32 , // i32 ) - Builder.CreateCall( - Intrinsic::getDeclaration(&M, PGOBlockCoverage - ? 
Intrinsic::instrprof_cover - : Intrinsic::instrprof_increment), - {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), - Builder.getInt32(I++)}); + Builder.CreateCall(Intrinsic::getOrInsertDeclaration( + &M, PGOBlockCoverage + ? Intrinsic::instrprof_cover + : Intrinsic::instrprof_increment), + {NormalizedNamePtr, CFGHash, + Builder.getInt32(NumCounters), Builder.getInt32(I++)}); } // Now instrument select instructions: @@ -1038,7 +1038,8 @@ void FunctionInstrumenter::instrument() { SmallVector OpBundles; populateEHOperandBundle(Cand, BlockColors, OpBundles); Builder.CreateCall( - Intrinsic::getDeclaration(&M, Intrinsic::instrprof_value_profile), + Intrinsic::getOrInsertDeclaration(&M, + Intrinsic::instrprof_value_profile), {NormalizedNamePtr, Builder.getInt64(FuncInfo.FunctionHash), ToProfile, Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}, OpBundles); @@ -1726,7 +1727,7 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { ConstantExpr::getPointerBitCastOrAddrSpaceCast( FuncNameVar, PointerType::get(M->getContext(), 0)); Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step), + Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step), {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index db4bf709c9cc..719806fdf37f 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -999,7 +999,7 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, if (Options.StackDepth && IsEntryBB && !IsLeafFunc) { // Check stack depth. If it's the deepest so far, record it. 
Module *M = F.getParent(); - Function *GetFrameAddr = Intrinsic::getDeclaration( + Function *GetFrameAddr = Intrinsic::getOrInsertDeclaration( M, Intrinsic::frameaddress, IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); auto FrameAddrPtr = diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 68cf4e553013..388addfab181 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -571,9 +571,10 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, // Instrument function entry/exit points if there were instrumented accesses. if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); - Value *ReturnAddress = IRB.CreateCall( - Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress), - IRB.getInt32(0)); + Value *ReturnAddress = + IRB.CreateCall(Intrinsic::getOrInsertDeclaration( + F.getParent(), Intrinsic::returnaddress), + IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h index c11691c613ac..0dedd0207571 100644 --- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h +++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h @@ -139,7 +139,7 @@ private: if (Decl) return Decl; - return Decl = Intrinsic::getDeclaration(TheModule, IntID); + return Decl = Intrinsic::getOrInsertDeclaration(TheModule, IntID); } }; diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp index 60fd2a286119..9317e0643079 100644 --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -380,7 +380,8 @@ bool 
InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_load: { Type *DestTy = II->getType(); Type *SrcTy = NewV->getType(); - Function *NewDecl = Intrinsic::getDeclaration(M, IID, {DestTy, SrcTy}); + Function *NewDecl = + Intrinsic::getOrInsertDeclaration(M, IID, {DestTy, SrcTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; @@ -391,7 +392,8 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_gather: { Type *RetTy = II->getType(); Type *NewPtrTy = NewV->getType(); - Function *NewDecl = Intrinsic::getDeclaration(M, IID, {RetTy, NewPtrTy}); + Function *NewDecl = + Intrinsic::getOrInsertDeclaration(M, IID, {RetTy, NewPtrTy}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; @@ -400,16 +402,16 @@ bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II, case Intrinsic::masked_scatter: { Type *ValueTy = II->getOperand(0)->getType(); Type *NewPtrTy = NewV->getType(); - Function *NewDecl = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), {ValueTy, NewPtrTy}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {ValueTy, NewPtrTy}); II->setArgOperand(1, NewV); II->setCalledFunction(NewDecl); return true; } case Intrinsic::prefetch: case Intrinsic::is_constant: { - Function *NewDecl = - Intrinsic::getDeclaration(M, II->getIntrinsicID(), {NewV->getType()}); + Function *NewDecl = Intrinsic::getOrInsertDeclaration( + M, II->getIntrinsicID(), {NewV->getType()}); II->setArgOperand(0, NewV); II->setCalledFunction(NewDecl); return true; diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index d85166e518f1..4043c0e9a7dd 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -405,7 +405,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { IRBuilder<> Builder(P.InsertPt); Module *M = 
BB->getParent()->getParent(); Type *I32 = Type::getInt32Ty(BB->getContext()); - Function *PrefetchFunc = Intrinsic::getDeclaration( + Function *PrefetchFunc = Intrinsic::getOrInsertDeclaration( M, Intrinsic::prefetch, PrefPtrValue->getType()); Builder.CreateCall( PrefetchFunc, diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp index d5e91d3c1dec..30369ed7c245 100644 --- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -978,8 +978,8 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI, assert(match(Br->getCondition(), m_Zero()) && "Expected branch condition to be false"); IRBuilder<> Builder(Br); - Function *F = Intrinsic::getDeclaration(M, Intrinsic::umul_with_overflow, - FI.OuterTripCount->getType()); + Function *F = Intrinsic::getOrInsertDeclaration( + M, Intrinsic::umul_with_overflow, FI.OuterTripCount->getType()); Value *Call = Builder.CreateCall(F, {FI.OuterTripCount, FI.InnerTripCount}, "flatten.mul"); FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount"); diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 740e1e39b9ee..56006d9ae692 100644 --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -2122,7 +2122,7 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, Type *Tys[] = {Val->getType()}; Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); - Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys); + Function *Func = Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctpop, Tys); CallInst *CI = IRBuilder.CreateCall(Func, Ops); CI->setDebugLoc(DL); @@ -2136,7 +2136,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val, Type *Tys[] = {Val->getType()}; Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); - 
Function *Func = Intrinsic::getDeclaration(M, IID, Tys); + Function *Func = Intrinsic::getOrInsertDeclaration(M, IID, Tys); CallInst *CI = IRBuilder.CreateCall(Func, Ops); CI->setDebugLoc(DL); diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index a59ecdda1746..ce35349376c4 100644 --- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -44,7 +44,7 @@ static bool lowerGuardIntrinsic(Function &F) { if (ToLower.empty()) return false; - auto *DeoptIntrinsic = Intrinsic::getDeclaration( + auto *DeoptIntrinsic = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()}); DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 0d98e844cf91..a4ab288b1bfe 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -1290,7 +1290,7 @@ public: if (AllowContraction) { // Use fmuladd for floating point operations and let the backend decide // if that's profitable. 
- Function *FMulAdd = Intrinsic::getDeclaration( + Function *FMulAdd = Intrinsic::getOrInsertDeclaration( Func.getParent(), Intrinsic::fmuladd, A->getType()); return Builder.CreateCall(FMulAdd, {A, B, Sum}); } diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index aea17aa82a88..b9f88ba4e078 100644 --- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -69,7 +69,7 @@ static bool explicifyGuards(Function &F) { if (GuardIntrinsics.empty()) return false; - auto *DeoptIntrinsic = Intrinsic::getDeclaration( + auto *DeoptIntrinsic = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()}); DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv()); diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index b568811dcdbc..557a75e8946d 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1855,8 +1855,8 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) { // If not, then we know we can transform this. Type *ArgTys[3] = {M->getRawDest()->getType(), M->getRawSource()->getType(), M->getLength()->getType()}; - M->setCalledFunction( - Intrinsic::getDeclaration(M->getModule(), Intrinsic::memcpy, ArgTys)); + M->setCalledFunction(Intrinsic::getOrInsertDeclaration( + M->getModule(), Intrinsic::memcpy, ArgTys)); // For MemorySSA nothing really changes (except that memcpy may imply stricter // aliasing guarantees). 
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index e3c12c971b9a..daf8fa28a71e 100644 --- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -1525,8 +1525,8 @@ static void CreateGCRelocates(ArrayRef LiveVariables, if (auto *VT = dyn_cast(Ty)) NewTy = FixedVectorType::get(NewTy, cast(VT)->getNumElements()); - return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate, - {NewTy}); + return Intrinsic::getOrInsertDeclaration( + M, Intrinsic::experimental_gc_relocate, {NewTy}); }; // Lazily populated map from input types to the canonicalized form mentioned diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 72728c0f839e..b1e4c7e52d99 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -733,7 +733,8 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { ValueVector Res(VS->NumFragments); ValueVector ScalarCallOps(NumArgs); - Function *NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + Function *NewIntrin = + Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); IRBuilder<> Builder(&CI); // Perform actual scalarization, taking care to preserve any scalar operands. 
@@ -756,7 +757,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { } if (IsRemainder) - NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + NewIntrin = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); Res[I] = Builder.CreateCall(NewIntrin, ScalarCallOps, CI.getName() + ".i" + Twine(I)); diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp index 3cf68e07da5b..e1dd20478fd5 100644 --- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp +++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp @@ -225,7 +225,8 @@ struct AssumeBuilderState { return nullptr; if (!DebugCounter::shouldExecute(BuildAssumeCounter)) return nullptr; - Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume); + Function *FnAssume = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::assume); LLVMContext &C = M->getContext(); SmallVector OpBundle; for (auto &MapElem : AssumedKnowledgeMap) { diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index fc03643e3542..c6ba85bd9e57 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -425,8 +425,8 @@ PruningFunctionCloner::cloneInstruction(BasicBlock::const_iterator II) { // Create intrinsic call. 
LLVMContext &Ctx = NewFunc->getContext(); - Function *IFn = - Intrinsic::getDeclaration(NewFunc->getParent(), CIID, TParams); + Function *IFn = Intrinsic::getOrInsertDeclaration(NewFunc->getParent(), + CIID, TParams); SmallVector Args; unsigned NumOperands = OldInst.getNumOperands(); if (isa(OldInst)) diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index f58448dd9562..a090c5ed7492 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1124,7 +1124,8 @@ static void insertLifetimeMarkersSurroundingCall( TheCall->getFunction()) && "Input memory not defined in original function"); - Function *Func = Intrinsic::getDeclaration(M, MarkerFunc, Mem->getType()); + Function *Func = + Intrinsic::getOrInsertDeclaration(M, MarkerFunc, Mem->getType()); auto Marker = CallInst::Create(Func, {NegativeOne, Mem}); if (InsertBefore) Marker->insertBefore(TheCall); diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp index d12c540f9a4d..47bb31905d1a 100644 --- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp +++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp @@ -63,7 +63,7 @@ static void insertCall(Function &CurFn, StringRef Func, Func, FunctionType::get(Type::getVoidTy(C), ArgTypes, false)); Instruction *RetAddr = CallInst::Create( - Intrinsic::getDeclaration(&M, Intrinsic::returnaddress), + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), ArrayRef(ConstantInt::get(Type::getInt32Ty(C), 0)), "", InsertionPt); RetAddr->setDebugLoc(DL); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 671b0d0822a5..110fd6de5c69 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2090,7 +2090,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, if 
(IsUnsafeClaimRV) { Builder.SetInsertPoint(II); Function *IFn = - Intrinsic::getDeclaration(Mod, Intrinsic::objc_release); + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_release); Builder.CreateCall(IFn, RetOpnd, ""); } II->eraseFromParent(); @@ -2125,7 +2125,8 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // matching autoreleaseRV or an annotated call in the callee. Emit a call // to objc_retain. Builder.SetInsertPoint(RI); - Function *IFn = Intrinsic::getDeclaration(Mod, Intrinsic::objc_retain); + Function *IFn = + Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_retain); Builder.CreateCall(IFn, RetOpnd, ""); } } @@ -3021,7 +3022,7 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, }); } else { SmallVector NormalReturns; - Function *NewDeoptIntrinsic = Intrinsic::getDeclaration( + Function *NewDeoptIntrinsic = Intrinsic::getOrInsertDeclaration( Caller->getParent(), Intrinsic::experimental_deoptimize, {Caller->getReturnType()}); diff --git a/llvm/lib/Transforms/Utils/IntegerDivision.cpp b/llvm/lib/Transforms/Utils/IntegerDivision.cpp index 11956816a6ec..e95a7a9ae525 100644 --- a/llvm/lib/Transforms/Utils/IntegerDivision.cpp +++ b/llvm/lib/Transforms/Utils/IntegerDivision.cpp @@ -157,8 +157,8 @@ static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor, BasicBlock *IBB = Builder.GetInsertBlock(); Function *F = IBB->getParent(); - Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, - DivTy); + Function *CTLZ = + Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, DivTy); // Our CFG is going to look like: // +---------------------+ diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index f3b8623ebb0f..06813bac7c78 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -4141,7 +4141,8 @@ bool llvm::recognizeBSwapOrBitReverseIdiom( else return false; - Function *F 
= Intrinsic::getDeclaration(I->getModule(), Intrin, DemandedTy); + Function *F = + Intrinsic::getOrInsertDeclaration(I->getModule(), Intrin, DemandedTy); Value *Provider = Res->Provider; // We may need to truncate the provider. diff --git a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp index 55f9400d93d7..cd7960065703 100644 --- a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp +++ b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp @@ -215,8 +215,8 @@ static bool runImpl(Module &M) { // If `__cxa_atexit` hits out-of-memory, trap, so that we don't misbehave. // This should be very rare, because if the process is running out of // memory before main has even started, something is wrong. - CallInst::Create(Intrinsic::getDeclaration(&M, Intrinsic::trap), "", - FailBB); + CallInst::Create(Intrinsic::getOrInsertDeclaration(&M, Intrinsic::trap), + "", FailBB); new UnreachableInst(C, FailBB); ReturnInst::Create(C, RetBB); diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 1cb1a7b396ba..77abf160dc70 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -268,7 +268,7 @@ bool isLifetimeIntrinsic(Value *V) { Value *readRegister(IRBuilder<> &IRB, StringRef Name) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ReadRegister = Intrinsic::getDeclaration( + Function *ReadRegister = Intrinsic::getOrInsertDeclaration( M, Intrinsic::read_register, IRB.getIntPtrTy(M->getDataLayout())); MDNode *MD = MDNode::get(M->getContext(), {MDString::get(M->getContext(), Name)}); @@ -287,7 +287,7 @@ Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) { Value *getFP(IRBuilder<> &IRB) { Function *F = IRB.GetInsertBlock()->getParent(); Module *M = F->getParent(); - auto *GetStackPointerFn = Intrinsic::getDeclaration( + auto *GetStackPointerFn = Intrinsic::getOrInsertDeclaration( M, 
Intrinsic::frameaddress, IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); return IRB.CreatePtrToInt( @@ -301,7 +301,7 @@ Value *getAndroidSlotPtr(IRBuilder<> &IRB, int Slot) { // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER // in Bionic's libc/private/bionic_tls.h. Function *ThreadPointerFunc = - Intrinsic::getDeclaration(M, Intrinsic::thread_pointer); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::thread_pointer); return IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc), 8 * Slot); } diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp index 186e17e166ba..2415118cad6f 100644 --- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp +++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp @@ -559,7 +559,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, if (isa(ValInfo)) { IRBuilder<> B(getBranchTerminator(ValInfo)); auto NumDecls = F.getParent()->getNumNamedValues(); - Function *IF = Intrinsic::getDeclaration( + Function *IF = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::ssa_copy, Op->getType()); if (NumDecls != F.getParent()->getNumNamedValues()) PI.CreatedDeclarations.insert(IF); @@ -575,7 +575,7 @@ Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter, // directly before it, assume(i1 true) is not a useful fact. 
IRBuilder<> B(PAssume->AssumeInst->getNextNode()); auto NumDecls = F.getParent()->getNumNamedValues(); - Function *IF = Intrinsic::getDeclaration( + Function *IF = Intrinsic::getOrInsertDeclaration( F.getParent(), Intrinsic::ssa_copy, Op->getType()); if (NumDecls != F.getParent()->getNumNamedValues()) PI.CreatedDeclarations.insert(IF); diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 1b7912fdf5e3..656bb1ebd116 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -444,7 +444,7 @@ private: /// Given a LoadInst LI this adds assume(LI != null) after it. static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) { Function *AssumeIntrinsic = - Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume); + Intrinsic::getOrInsertDeclaration(LI->getModule(), Intrinsic::assume); ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI, Constant::getNullValue(LI->getType())); LoadNotNull->insertAfter(LI); diff --git a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp index 6e84965370b2..2700b4307308 100644 --- a/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp +++ b/llvm/lib/Transforms/Utils/RelLookupTableConverter.cpp @@ -151,7 +151,7 @@ static void convertToRelLookupTable(GlobalVariable &LookupTable) { // GEP might not be immediately followed by a LOAD, like it can be hoisted // outside the loop or another instruction might be inserted them in between. 
Builder.SetInsertPoint(Load); - Function *LoadRelIntrinsic = llvm::Intrinsic::getDeclaration( + Function *LoadRelIntrinsic = llvm::Intrinsic::getOrInsertDeclaration( &M, Intrinsic::load_relative, {Index->getType()}); // Create a call to load.relative intrinsic that computes the target address diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index 1ff3cd78aa98..de1864ef5b8d 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2134,8 +2134,8 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, MulV = TruncTripCount; OfMul = ConstantInt::getFalse(MulV->getContext()); } else { - auto *MulF = Intrinsic::getDeclaration(Loc->getModule(), - Intrinsic::umul_with_overflow, Ty); + auto *MulF = Intrinsic::getOrInsertDeclaration( + Loc->getModule(), Intrinsic::umul_with_overflow, Ty); CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index e06ebb691d51..db2acb9eed09 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1960,7 +1960,7 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, if (IsIntrinsic) { Module *M = CI->getModule(); Intrinsic::ID IID = CalleeFn->getIntrinsicID(); - Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy()); + Function *Fn = Intrinsic::getOrInsertDeclaration(M, IID, B.getFloatTy()); R = isBinary ? 
B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); } else { AttributeList CalleeAttrs = CalleeFn->getAttributes(); diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e2958c49b8ca..5c164075e832 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -15079,7 +15079,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { false /*HasGlobalPred*/); CF = VFDatabase(*CI).getVectorizedFunction(Shape); } else { - CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl); + CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl); } SmallVector OpBundles; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ba94cd295876..2948ecc580ed 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -984,7 +984,7 @@ void VPWidenIntrinsicRecipe::execute(VPTransformState &State) { // Use vector version of the intrinsic. 
Module *M = State.Builder.GetInsertBlock()->getModule(); Function *VectorF = - Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl); + Intrinsic::getOrInsertDeclaration(M, VectorIntrinsicID, TysForDecl); assert(VectorF && "Can't retrieve vector intrinsic."); auto *CI = cast_or_null(getUnderlyingValue()); diff --git a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp index fb8729c36a6f..0e2a6decfbc9 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceOpcodes.cpp @@ -30,7 +30,7 @@ static bool shouldIgnoreArgument(const Value *V) { static Value *replaceIntrinsic(Module &M, IntrinsicInst *II, Intrinsic::ID NewIID, ArrayRef Tys = {}) { - Function *NewFunc = Intrinsic::getDeclaration(&M, NewIID, Tys); + Function *NewFunc = Intrinsic::getOrInsertDeclaration(&M, NewIID, Tys); II->setCalledFunction(NewFunc); return II; } diff --git a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp index 6437e0c9491f..8ad15ca41510 100644 --- a/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp +++ b/llvm/unittests/Analysis/AssumeBundleQueriesTest.cpp @@ -430,7 +430,8 @@ static void RunRandTest(uint64_t Seed, int Size, int MinCount, int MaxCount, BB->insertInto(F); Instruction *Ret = ReturnInst::Create(C); Ret->insertInto(BB, BB->begin()); - Function *FnAssume = Intrinsic::getDeclaration(Mod.get(), Intrinsic::assume); + Function *FnAssume = + Intrinsic::getOrInsertDeclaration(Mod.get(), Intrinsic::assume); std::vector ShuffledArgs; BitVector HasArg; diff --git a/llvm/unittests/Analysis/MemorySSATest.cpp b/llvm/unittests/Analysis/MemorySSATest.cpp index 9e6c517242a2..81784bb23609 100644 --- a/llvm/unittests/Analysis/MemorySSATest.cpp +++ b/llvm/unittests/Analysis/MemorySSATest.cpp @@ -1120,7 +1120,7 @@ TEST_F(MemorySSATest, LifetimeMarkersAreClobbers) { B.CreateStore(B.getInt8(0), Bar); auto GetLifetimeIntrinsic = [&](Intrinsic::ID 
ID) { - return Intrinsic::getDeclaration(&M, ID, {Foo->getType()}); + return Intrinsic::getOrInsertDeclaration(&M, ID, {Foo->getType()}); }; B.CreateCall(GetLifetimeIntrinsic(Intrinsic::lifetime_end), diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 77d966155dce..0145ee70a14c 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -2481,8 +2481,8 @@ TEST_F(ComputeKnownBitsTest, ComputeKnownBitsAddWithRange) { TEST_F(ComputeKnownBitsTest, ComputeKnownBitsUnknownVScale) { Module M("", Context); IRBuilder<> Builder(Context); - Function *TheFn = - Intrinsic::getDeclaration(&M, Intrinsic::vscale, {Builder.getInt32Ty()}); + Function *TheFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::vscale, + {Builder.getInt32Ty()}); CallInst *CI = Builder.CreateCall(TheFn, {}, {}, ""); KnownBits Known = computeKnownBits(CI, M.getDataLayout(), /* Depth */ 0); diff --git a/llvm/unittests/IR/BasicBlockTest.cpp b/llvm/unittests/IR/BasicBlockTest.cpp index eea2746a352a..88ac6611742c 100644 --- a/llvm/unittests/IR/BasicBlockTest.cpp +++ b/llvm/unittests/IR/BasicBlockTest.cpp @@ -109,8 +109,10 @@ TEST(BasicBlockTest, TestInstructionsWithoutDebug) { Argument *V = new Argument(Type::getInt32Ty(Ctx)); Function *F = Function::Create(FT, Function::ExternalLinkage, "", M); - Function *DbgDeclare = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); - Function *DbgValue = Intrinsic::getDeclaration(M, Intrinsic::dbg_value); + Function *DbgDeclare = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); + Function *DbgValue = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_value); Value *DIV = MetadataAsValue::get(Ctx, (Metadata *)nullptr); SmallVector Args = {DIV, DIV, DIV}; @@ -174,7 +176,7 @@ class InstrOrderInvalidationTest : public ::testing::Test { protected: void SetUp() override { M.reset(new Module("MyModule", Ctx)); - Nop = 
Intrinsic::getDeclaration(M.get(), Intrinsic::donothing); + Nop = Intrinsic::getOrInsertDeclaration(M.get(), Intrinsic::donothing); FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx), {}, false); Function *F = Function::Create(FT, Function::ExternalLinkage, "foo", *M); BB = BasicBlock::Create(Ctx, "entry", F); diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index 953df224e84d..ea20c87d6b09 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -693,7 +693,8 @@ TEST(IRBuilder, GetSetInsertionPointWithEmptyBasicBlock) { std::unique_ptr BB(BasicBlock::Create(C, "start")); Module *M = new Module("module", C); IRBuilder<> Builder(BB.get()); - Function *DbgDeclare = Intrinsic::getDeclaration(M, Intrinsic::dbg_declare); + Function *DbgDeclare = + Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_declare); Value *DIV = MetadataAsValue::get(C, (Metadata *)nullptr); SmallVector Args = {DIV, DIV, DIV}; Builder.CreateCall(DbgDeclare, Args); diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index d5239f21147c..690af62d1802 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -413,8 +413,9 @@ TEST_F(IRBuilderTest, ConstrainedFPIntrinsics) { Builder.setDefaultConstrainedExcept(fp::ebStrict); Builder.setDefaultConstrainedRounding(RoundingMode::TowardZero); - Function *Fn = Intrinsic::getDeclaration(M.get(), - Intrinsic::experimental_constrained_roundeven, { Type::getDoubleTy(Ctx) }); + Function *Fn = Intrinsic::getOrInsertDeclaration( + M.get(), Intrinsic::experimental_constrained_roundeven, + {Type::getDoubleTy(Ctx)}); V = Builder.CreateConstrainedFPCall(Fn, { VDouble }); CII = cast(V); EXPECT_EQ(Intrinsic::experimental_constrained_roundeven, CII->getIntrinsicID()); diff --git a/llvm/unittests/IR/IntrinsicsTest.cpp b/llvm/unittests/IR/IntrinsicsTest.cpp index 0c4af28a2ab5..7fe0bd79b80a 100644 --- 
a/llvm/unittests/IR/IntrinsicsTest.cpp +++ b/llvm/unittests/IR/IntrinsicsTest.cpp @@ -50,7 +50,7 @@ public: Instruction *makeIntrinsic(Intrinsic::ID ID) const { IRBuilder<> Builder(BB); SmallVector ProcessedArgs; - auto *Decl = Intrinsic::getDeclaration(M.get(), ID); + auto *Decl = Intrinsic::getOrInsertDeclaration(M.get(), ID); for (auto *Ty : Decl->getFunctionType()->params()) { auto *Val = Constant::getNullValue(Ty); ProcessedArgs.push_back(Val); diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 13f121a2b9c7..7dc4b9f448d3 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ b/llvm/unittests/IR/PatternMatch.cpp @@ -1766,7 +1766,7 @@ TEST_F(PatternMatchTest, IntrinsicMatcher) { Value *Ops[] = {Name, Hash, Num, Index, Step}; Module *M = BB->getParent()->getParent(); Function *TheFn = - Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step); + Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step); Value *Intrinsic5 = CallInst::Create(TheFn, Ops, "", BB); diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index 925a69bafa07..d6ad7599ce46 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -420,7 +420,7 @@ TEST_F(VPIntrinsicTest, VPToNonPredIntrinsicRoundTrip) { ASSERT_TRUE(IsFullTrip); } -/// Check that VPIntrinsic::getDeclarationForParams works. +/// Check that VPIntrinsic::getOrInsertDeclarationForParams works. 
TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) { std::unique_ptr M = createVPDeclarationModule(); assert(M); @@ -436,7 +436,7 @@ TEST_F(VPIntrinsicTest, VPIntrinsicDeclarationForParams) { Values.push_back(UndefValue::get(ParamTy)); ASSERT_NE(F.getIntrinsicID(), Intrinsic::not_intrinsic); - auto *NewDecl = VPIntrinsic::getDeclarationForParams( + auto *NewDecl = VPIntrinsic::getOrInsertDeclarationForParams( OutM.get(), F.getIntrinsicID(), FuncTy->getReturnType(), Values); ASSERT_TRUE(NewDecl); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 372c5aaea593..376b00224eb5 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1195,7 +1195,8 @@ TEST(VPRecipeTest, MayHaveSideEffectsAndMayReadWriteMemory) { // Test for a call to a function without side-effects. LLVMContext C; Module M("", C); - Function *TheFn = Intrinsic::getDeclaration(&M, Intrinsic::thread_pointer); + Function *TheFn = + Intrinsic::getOrInsertDeclaration(&M, Intrinsic::thread_pointer); auto *Call = CallInst::Create(TheFn->getFunctionType(), TheFn); VPValue Op1; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 5031426033ae..448a171cf3e4 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -347,7 +347,7 @@ class LLVM_ConstrainedIntrgetType(); }); llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::Function *callee = - llvm::Intrinsic::getDeclaration(module, + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::experimental_constrained_}] # mnem # [{, overloadedTypes); }] # !cond(!gt(hasRoundingMode, 0) : [{ @@ -541,7 +541,7 @@ class LLVM_DbgIntrOp traits = []> llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::LLVMContext &ctx = module->getContext(); 
llvm::Function *fn = - llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::}] + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::}] # !subst(".", "_", name) # [{); builder.CreateCall(fn, { llvm::MetadataAsValue::get(ctx, @@ -594,7 +594,7 @@ def LLVM_DbgLabelOp : LLVM_IntrOp<"dbg.label", [], [], [], 0> { llvm::Module *module = builder.GetInsertBlock()->getModule(); llvm::LLVMContext &ctx = module->getContext(); llvm::Function *fn = - llvm::Intrinsic::getDeclaration(module, llvm::Intrinsic::dbg_label); + llvm::Intrinsic::getOrInsertDeclaration(module, llvm::Intrinsic::dbg_label); builder.CreateCall(fn, { llvm::MetadataAsValue::get(ctx, moduleTranslation.translateDebugInfo($label)) }); diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 46b7b0a473c6..a8595d14ccf2 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -99,7 +99,8 @@ getOverloadedDeclaration(CallIntrinsicOp op, llvm::Intrinsic::ID id, } ArrayRef overloadedArgTysRef = overloadedArgTys; - return llvm::Intrinsic::getDeclaration(module, id, overloadedArgTysRef); + return llvm::Intrinsic::getOrInsertDeclaration(module, id, + overloadedArgTysRef); } static llvm::OperandBundleDef @@ -143,7 +144,7 @@ convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder, return failure(); fn = *fnOrFailure; } else { - fn = llvm::Intrinsic::getDeclaration(module, id, {}); + fn = llvm::Intrinsic::getOrInsertDeclaration(module, id, {}); } // Check the result type of the call. 
diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index a5de90160c41..add0a31c114f 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -839,7 +839,8 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( llvm::IRBuilderBase &builder, llvm::Intrinsic::ID intrinsic, ArrayRef args, ArrayRef tys) { llvm::Module *module = builder.GetInsertBlock()->getModule(); - llvm::Function *fn = llvm::Intrinsic::getDeclaration(module, intrinsic, tys); + llvm::Function *fn = + llvm::Intrinsic::getOrInsertDeclaration(module, intrinsic, tys); return builder.CreateCall(fn, args); } @@ -886,8 +887,8 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( for (unsigned overloadedOperandIdx : overloadedOperands) overloadedTypes.push_back(args[overloadedOperandIdx]->getType()); llvm::Module *module = builder.GetInsertBlock()->getModule(); - llvm::Function *llvmIntr = - llvm::Intrinsic::getDeclaration(module, intrinsic, overloadedTypes); + llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration( + module, intrinsic, overloadedTypes); return builder.CreateCall(llvmIntr, args); } diff --git a/polly/lib/CodeGen/IslExprBuilder.cpp b/polly/lib/CodeGen/IslExprBuilder.cpp index aaafac14bf80..1688c41c624b 100644 --- a/polly/lib/CodeGen/IslExprBuilder.cpp +++ b/polly/lib/CodeGen/IslExprBuilder.cpp @@ -129,16 +129,16 @@ Value *IslExprBuilder::createBinOp(BinaryOperator::BinaryOps Opc, Value *LHS, Module *M = Builder.GetInsertBlock()->getModule(); switch (Opc) { case Instruction::Add: - F = Intrinsic::getDeclaration(M, Intrinsic::sadd_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::sadd_with_overflow, + {LHS->getType()}); break; case Instruction::Sub: - F = Intrinsic::getDeclaration(M, Intrinsic::ssub_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::ssub_with_overflow, + 
{LHS->getType()}); break; case Instruction::Mul: - F = Intrinsic::getDeclaration(M, Intrinsic::smul_with_overflow, - {LHS->getType()}); + F = Intrinsic::getOrInsertDeclaration(M, Intrinsic::smul_with_overflow, + {LHS->getType()}); break; default: llvm_unreachable("No overflow intrinsic for binary operator found!"); diff --git a/polly/lib/CodeGen/PerfMonitor.cpp b/polly/lib/CodeGen/PerfMonitor.cpp index 3cad8537f3ee..1a7916146854 100644 --- a/polly/lib/CodeGen/PerfMonitor.cpp +++ b/polly/lib/CodeGen/PerfMonitor.cpp @@ -59,7 +59,7 @@ void PerfMonitor::addToGlobalConstructors(Function *Fn) { } Function *PerfMonitor::getRDTSCP() { - return Intrinsic::getDeclaration(M, Intrinsic::x86_rdtscp); + return Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_rdtscp); } PerfMonitor::PerfMonitor(const Scop &S, Module *M) -- GitLab From c84f75966af79a381e27e6ffc9481c1fae2fcb4f Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 11 Oct 2024 05:38:17 -0700 Subject: [PATCH 029/345] [libc] Fix compilation of new trig functions (#111987) --- libc/src/math/generic/cos.cpp | 2 +- libc/src/math/generic/range_reduction_double_common.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp index 923ea96852d8..568b1254c6f0 100644 --- a/libc/src/math/generic/cos.cpp +++ b/libc/src/math/generic/cos.cpp @@ -93,7 +93,7 @@ LLVM_LIBC_FUNCTION(double, cos, (double x)) { } return ans; }; - DoubleDouble sin_k = get_idx_dd(k + 128); + DoubleDouble msin_k = get_idx_dd(k + 128); DoubleDouble cos_k = get_idx_dd(k + 64); #else // Fast look up version, but needs 256-entry table. 
diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h index e23bbff144be..bcab82f6c9c3 100644 --- a/libc/src/math/generic/range_reduction_double_common.h +++ b/libc/src/math/generic/range_reduction_double_common.h @@ -278,6 +278,7 @@ private: DoubleDouble y_mid; }; +#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS static Float128 range_reduction_small_f128(double x) { constexpr Float128 PI_OVER_128_F128 = { Sign::POS, -133, 0xc90f'daa2'2168'c234'c4c6'628b'80dc'1cd1_u128}; @@ -300,7 +301,6 @@ static Float128 range_reduction_small_f128(double x) { return fputil::quick_mul(y, PI_OVER_128_F128); } -#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS static constexpr Float128 SIN_K_PI_OVER_128_F128[65] = { {Sign::POS, 0, 0}, {Sign::POS, -133, 0xc90a'afbd'1b33'efc9'c539'edcb'fda0'cf2c_u128}, -- GitLab From 26b832a9ec03d0a35baaf00d81f607004fe2a8cf Mon Sep 17 00:00:00 2001 From: Daniel Mokeev Date: Fri, 11 Oct 2024 14:41:47 +0200 Subject: [PATCH 030/345] [RISCV] Add DAG combine to turn (sub (shl X, 8-Y), (shr X, Y)) into orc.b (#111828) This patch generalizes the DAG combine for `(sub (shl X, 8), X) => (orc.b X)` into the more general form of `(sub (shl X, 8 - Y), (srl X, Y)) => (orc.b X)`. 
Alive2 generalized proof: https://alive2.llvm.org/ce/z/dFcf_n Related issue: https://github.com/llvm/llvm-project/issues/96595 Related PR: https://github.com/llvm/llvm-project/pull/96680 --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 44 ++- llvm/test/CodeGen/RISCV/orc-b-patterns.ll | 372 ++++++++++++++++++++ 2 files changed, 408 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/orc-b-patterns.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 1f9fc984515c..e71c8c3dc1c7 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -13569,8 +13569,10 @@ static SDValue combineSubOfBoolean(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(ISD::ADD, DL, VT, NewLHS, NewRHS); } -// Looks for (sub (shl X, 8), X) where only bits 8, 16, 24, 32, etc. of X are -// non-zero. Replace with orc.b. +// Looks for (sub (shl X, 8-Y), (shr X, Y)) where the Y-th bit in each byte is +// potentially set. It is fine for Y to be 0, meaning that (sub (shl X, 8), X) +// is also valid. Replace with (orc.b X). For example, 0b0000_1000_0000_1000 is +// valid with Y=3, while 0b0000_1000_0000_0100 is not. 
static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { if (!Subtarget.hasStdExtZbb()) @@ -13584,18 +13586,44 @@ static SDValue combineSubShiftToOrcB(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); - if (N0.getOpcode() != ISD::SHL || N0.getOperand(0) != N1 || !N0.hasOneUse()) + if (N0->getOpcode() != ISD::SHL) return SDValue(); - auto *ShAmtC = dyn_cast(N0.getOperand(1)); - if (!ShAmtC || ShAmtC->getZExtValue() != 8) + auto *ShAmtCLeft = dyn_cast(N0.getOperand(1)); + if (!ShAmtCLeft) return SDValue(); + unsigned ShiftedAmount = 8 - ShAmtCLeft->getZExtValue(); - APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0xfe)); - if (!DAG.MaskedValueIsZero(N1, Mask)) + if (ShiftedAmount >= 8) return SDValue(); - return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, N1); + SDValue LeftShiftOperand = N0->getOperand(0); + SDValue RightShiftOperand = N1; + + if (ShiftedAmount != 0) { // Right operand must be a right shift. + if (N1->getOpcode() != ISD::SRL) + return SDValue(); + auto *ShAmtCRight = dyn_cast(N1.getOperand(1)); + if (!ShAmtCRight || ShAmtCRight->getZExtValue() != ShiftedAmount) + return SDValue(); + RightShiftOperand = N1.getOperand(0); + } + + // At least one shift should have a single use. + if (!N0.hasOneUse() && (ShiftedAmount == 0 || !N1.hasOneUse())) + return SDValue(); + + if (LeftShiftOperand != RightShiftOperand) + return SDValue(); + + APInt Mask = APInt::getSplat(VT.getSizeInBits(), APInt(8, 0x1)); + Mask <<= ShiftedAmount; + // Check that X has indeed the right shape (only the Y-th bit can be set in + // every byte). 
+ if (!DAG.MaskedValueIsZero(LeftShiftOperand, ~Mask)) + return SDValue(); + + return DAG.getNode(RISCVISD::ORC_B, SDLoc(N), VT, LeftShiftOperand); } static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, diff --git a/llvm/test/CodeGen/RISCV/orc-b-patterns.ll b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll new file mode 100644 index 000000000000..184e66c14b33 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/orc-b-patterns.ll @@ -0,0 +1,372 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32I +; RUN: llc -mtriple=riscv32 -mattr=+zbb -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefixes=CHECK,RV32ZBB + +define i32 @orc_b_i32_mul255(i32 %x) nounwind { +; RV32I-LABEL: orc_b_i32_mul255: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_mul255: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 4112 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %mul = mul nuw nsw i32 %and, 255 + ret i32 %mul +} + + +define i32 @orc_b_i32_sub_shl8x_x_lsb(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 4112 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 4112 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + +define i32 
@orc_b_i32_sub_shl8x_x_lsb_preshifted(i32 %x){ +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: srli a0, a0, 11 +; RV32I-NEXT: lui a1, 16 +; RV32I-NEXT: addi a1, a1, 257 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 8 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_lsb_preshifted: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: srli a0, a0, 11 +; RV32ZBB-NEXT: lui a1, 16 +; RV32ZBB-NEXT: addi a1, a1, 257 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %shr = lshr i32 %x, 11 + %and = and i32 %shr, 16843009 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b1(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 8224 +; RV32I-NEXT: addi a1, a1, 514 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 8224 +; RV32ZBB-NEXT: addi a1, a1, 514 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b2(i32 %x) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 16448 +; RV32I-NEXT: addi a1, a1, 1028 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 6 +; RV32I-NEXT: srli a0, a0, 2 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b2: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 16448 +; RV32ZBB-NEXT: addi a1, a1, 1028 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 67372036 + %shl = shl 
i32 %and, 6 + %shr = lshr exact i32 %and, 2 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b3(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b3: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 24672 +; CHECK-NEXT: addi a1, a1, 1542 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 5 +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 101058054 + %shl = shl nuw i32 %and, 5 + %shr = lshr i32 %and, 3 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b4(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 32897 +; CHECK-NEXT: addi a1, a1, -2040 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 4 +; CHECK-NEXT: srli a0, a0, 4 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 134744072 + %shl = shl nuw i32 %and, 4 + %shr = lshr i32 %and, 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b5(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b5: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 65793 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 3 +; CHECK-NEXT: srli a0, a0, 5 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 269488144 + %shl = shl nuw i32 %and, 3 + %shr = lshr i32 %and, 5 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b6(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b6: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 131586 +; CHECK-NEXT: addi a1, a1, 32 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 2 +; CHECK-NEXT: srli a0, a0, 6 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 538976288 + %shl = shl nuw i32 %and, 2 + %shr = lshr i32 %and, 6 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 
@orc_b_i32_sub_shl8x_x_b7(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b7: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 263172 +; CHECK-NEXT: addi a1, a1, 64 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: slli a1, a0, 1 +; CHECK-NEXT: srli a0, a0, 7 +; CHECK-NEXT: sub a0, a1, a0 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 1077952576 + %shl = shl nuw i32 %and, 1 + %shr = lshr i32 %and, 7 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_shl_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a2, 8224 +; RV32I-NEXT: addi a2, a2, 514 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 7 +; RV32I-NEXT: srli a3, a0, 1 +; RV32I-NEXT: sub a0, a2, a3 +; RV32I-NEXT: sw a3, 0(a1) +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_shl_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a2, 8224 +; RV32ZBB-NEXT: addi a2, a2, 514 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: srli a2, a0, 1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shr, ptr %arr, align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_srl_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a2, 8224 +; RV32I-NEXT: addi a2, a2, 514 +; RV32I-NEXT: and a0, a0, a2 +; RV32I-NEXT: slli a2, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a2, a0 +; RV32I-NEXT: sw a2, 0(a1) +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_srl_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a2, 8224 +; RV32ZBB-NEXT: addi a2, a2, 514 +; RV32ZBB-NEXT: and a0, a0, a2 +; RV32ZBB-NEXT: slli a2, a0, 7 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: sw a2, 0(a1) +; RV32ZBB-NEXT: ret +entry: + %and = and i32 
%x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shl, ptr %arr, align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_shl8x_x_b1_not_used(i32 %x, ptr %arr) { +; RV32I-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: lui a1, 8224 +; RV32I-NEXT: addi a1, a1, 514 +; RV32I-NEXT: and a0, a0, a1 +; RV32I-NEXT: slli a1, a0, 7 +; RV32I-NEXT: srli a0, a0, 1 +; RV32I-NEXT: sub a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32ZBB-LABEL: orc_b_i32_sub_shl8x_x_b1_not_used: +; RV32ZBB: # %bb.0: # %entry +; RV32ZBB-NEXT: lui a1, 8224 +; RV32ZBB-NEXT: addi a1, a1, 514 +; RV32ZBB-NEXT: and a0, a0, a1 +; RV32ZBB-NEXT: orc.b a0, a0 +; RV32ZBB-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_shl_used(i32 %x, ptr %arr){ +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_shl_used: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 4112 +; CHECK-NEXT: addi a2, a2, 257 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: slli a2, a0, 8 +; CHECK-NEXT: sub a0, a2, a0 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %shl = shl i32 %and, 8 + store i32 %shl, ptr %arr, align 4 + %sub = mul nuw i32 %and, 255 + ret i32 %sub +} + +define i32 @orc_b_i32_sub_shl8x_x_b1_both_used(i32 %x, ptr %arr) { +; CHECK-LABEL: orc_b_i32_sub_shl8x_x_b1_both_used: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a2, 8224 +; CHECK-NEXT: addi a2, a2, 514 +; CHECK-NEXT: and a0, a0, a2 +; CHECK-NEXT: slli a2, a0, 7 +; CHECK-NEXT: srli a3, a0, 1 +; CHECK-NEXT: sw a2, 0(a1) +; CHECK-NEXT: sub a0, a2, a3 +; CHECK-NEXT: sw a3, 4(a1) +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 33686018 + %shl = shl i32 %and, 7 + %shr = lshr exact i32 %and, 1 + store i32 %shl, ptr %arr, align 4 + %arrayidx1 = getelementptr inbounds i8, ptr %arr, i32 4 + store i32 %shr, ptr %arrayidx1, 
align 4 + %sub = sub nsw i32 %shl, %shr + ret i32 %sub +} + + +define i32 @orc_b_i32_sub_x_shr8x(i32 %x) { +; CHECK-LABEL: orc_b_i32_sub_x_shr8x: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lui a1, 4112 +; CHECK-NEXT: addi a1, a1, 257 +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: srli a1, a0, 8 +; CHECK-NEXT: sub a0, a0, a1 +; CHECK-NEXT: ret +entry: + %and = and i32 %x, 16843009 + %shr = lshr i32 %and, 8 + %sub = sub nsw i32 %and, %shr + ret i32 %sub +} -- GitLab From 9a696b68b735fa01276d16d39370f9102fee4a0b Mon Sep 17 00:00:00 2001 From: Emilio Cota Date: Fri, 11 Oct 2024 08:18:11 -0400 Subject: [PATCH 031/345] Revert "[NVPTX] Prefer prmt.b32 over bfi.b32 (#110766)" This reverts commit 3f9998af4f79e95fe8be615df9d6b898008044b9. It breaks downstream tests with egregious numerical differences. Unfortunately no upstream tests are broken, but the fact that a prior iteration of the commit (pre-optimization) does work with our downstream tests (coming from the Triton repo) supports the claim that the final version of the commit is incorrect. Reverting now so that the original author can evaluate. --- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 31 +- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 614 +++++++++---------- llvm/test/CodeGen/NVPTX/sext-setcc.ll | 18 +- 3 files changed, 328 insertions(+), 335 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d95f8f214be5..57bc5fe0ac36 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -2332,23 +2332,20 @@ SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op, // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us // to optimize calculation of constant parts. 
if (VT == MVT::v4i8) { - SDValue PRMT__10 = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), - DAG.getConstant(0x3340, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - SDValue PRMT32__ = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), - DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), - DAG.getConstant(0x4033, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - SDValue PRMT3210 = DAG.getNode( - NVPTXISD::PRMT, DL, MVT::v4i8, - {PRMT__10, PRMT32__, DAG.getConstant(0x5410, DL, MVT::i32), - DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)}); - return DAG.getNode(ISD::BITCAST, DL, VT, PRMT3210); + SDValue C8 = DAG.getConstant(8, DL, MVT::i32); + SDValue E01 = DAG.getNode( + NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(1), DL, MVT::i32), + DAG.getAnyExtOrTrunc(Op->getOperand(0), DL, MVT::i32), C8, C8); + SDValue E012 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(2), DL, MVT::i32), + E01, DAG.getConstant(16, DL, MVT::i32), C8); + SDValue E0123 = + DAG.getNode(NVPTXISD::BFI, DL, MVT::i32, + DAG.getAnyExtOrTrunc(Op->getOperand(3), DL, MVT::i32), + E012, DAG.getConstant(24, DL, MVT::i32), C8); + return DAG.getNode(ISD::BITCAST, DL, VT, E0123); } return Op; } diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 84dde539ce4c..96a4359d0ec4 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -101,38 +101,38 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_add( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, 
[test_add_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: add.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: add.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: add.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, %b @@ -143,29 +143,29 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_0( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 
%r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 4; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 3; +; CHECK-NEXT: add.s16 %rs4, %rs3, 2; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 2; +; CHECK-NEXT: add.s16 %rs6, %rs5, 3; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> , %a @@ -176,29 +176,29 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-LABEL: test_add_imm_1( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 4; +; CHECK-NEXT: add.s16 %rs2, 
%rs1, 1; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 3; +; CHECK-NEXT: add.s16 %rs4, %rs3, 2; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 16435; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r6, %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 2; +; CHECK-NEXT: add.s16 %rs6, %rs5, 3; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 21520; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r6, 16, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 4; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r9, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; %r = add <4 x i8> %a, @@ -209,38 +209,38 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_sub( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: sub.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 
16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: sub.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: sub.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: sub.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = sub <4 x i8> %a, %b @@ -251,38 +251,38 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 
%r6, %r1, 16, 8; ; CHECK-NEXT: setp.gt.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.gt.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.gt.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sgt <4 x i8> %a, %b @@ -294,30 +294,30 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_umax( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.u32 %r2, [test_umax_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ugt <4 x i8> %a, %b @@ -329,38 +329,38 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_smin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: 
bfe.s32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.s32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.le.s32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.s32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.s32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.s32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.le.s32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.s32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.s32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.s32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.le.s32 %p4, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 8, 8; -; CHECK-NEXT: bfe.u32 %r13, %r1, 16, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; -; CHECK-NEXT: bfe.u32 %r15, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r13, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r15, %r2, 0, 8; ; CHECK-NEXT: selp.b32 %r16, %r14, %r15, %p4; -; CHECK-NEXT: bfe.u32 %r17, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r17, %r2, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r13, %r17, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r16, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; ; CHECK-NEXT: selp.b32 %r21, %r12, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r22, %r2, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r11, %r22, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r21, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r23, %r2, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r11, %r23, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r22, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; %cmp = icmp sle <4 x i8> %a, %b @@ -372,30 +372,30 @@ define <4 x i8> @test_umin(<4 x i8> %a, 
<4 x i8> %b) #0 { ; CHECK-LABEL: test_umin( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.ls.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.ls.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.ls.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.b32 %r11, %r10, %r9, %p4; ; CHECK-NEXT: selp.b32 %r12, %r8, %r7, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.b32 %r14, %r6, %r5, %p2; -; CHECK-NEXT: selp.b32 %r15, %r4, %r3, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.b32 %r16, %r4, %r3, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %cmp = icmp ule <4 x i8> %a, %b @@ -407,35 +407,35 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_eq( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; +; CHECK-NEXT: .reg .b32 %r<24>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.u32 %r3, [test_eq_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: setp.eq.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; ; CHECK-NEXT: setp.eq.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; ; CHECK-NEXT: setp.eq.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; +; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; +; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; +; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; CHECK-NEXT: ret; %cmp = icmp eq <4 x i8> %a, %b @@ -447,35 +447,35 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-LABEL: test_ne( 
; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<23>; +; CHECK-NEXT: .reg .b32 %r<24>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; -; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r4, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; -; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r7, %r6; -; CHECK-NEXT: bfe.u32 %r8, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r9, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r9, %r1, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r9, %r8; -; CHECK-NEXT: bfe.u32 %r10, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r11, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r11, %r10; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r11, %r12, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r3, 8, 8; ; CHECK-NEXT: selp.b32 %r15, %r9, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r17, %r3, 8, 8; +; CHECK-NEXT: bfi.b32 %r16, %r15, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r3, 16, 8; ; CHECK-NEXT: selp.b32 %r18, %r7, %r17, %p2; -; CHECK-NEXT: bfe.u32 %r19, %r3, 0, 8; -; CHECK-NEXT: selp.b32 %r20, %r5, %r19, %p1; -; CHECK-NEXT: prmt.b32 %r21, %r20, %r18, 13120; -; CHECK-NEXT: prmt.b32 %r22, %r21, %r16, 21520; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r16, 16, 8; +; CHECK-NEXT: bfe.u32 %r20, %r3, 24, 8; +; CHECK-NEXT: selp.b32 %r21, %r5, %r20, %p1; +; CHECK-NEXT: bfi.b32 %r22, %r21, %r19, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r22; ; 
CHECK-NEXT: ret; %cmp = icmp ne <4 x i8> %a, %b @@ -487,38 +487,38 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-LABEL: test_mul( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<18>; +; CHECK-NEXT: .reg .b32 %r<19>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; -; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs2, %r4; ; CHECK-NEXT: mul.lo.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.u32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r2, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs4, %r6; -; CHECK-NEXT: bfe.u32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; ; CHECK-NEXT: mul.lo.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.u32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; -; CHECK-NEXT: bfe.u32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs8, %r11; ; CHECK-NEXT: mul.lo.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs10, %r13; -; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs10, %r14; +; CHECK-NEXT: bfe.u32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r15; ; CHECK-NEXT: mul.lo.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; 
CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r17; ; CHECK-NEXT: ret; %r = mul <4 x i8> %a, %b @@ -548,13 +548,12 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: or.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: or.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -614,13 +613,12 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; -; CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: xor.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: xor.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -680,13 +678,12 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; -; 
CHECK-NEXT: mov.b32 %r1, 0; -; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 0, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: bfi.b32 %r6, 5, %r5, 8, 8; -; CHECK-NEXT: and.b32 %r8, %r6, %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 0, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 0, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 0, %r3, 24, 8; +; CHECK-NEXT: bfi.b32 %r6, 5, %r4, 8, 8; +; CHECK-NEXT: and.b32 %r8, %r6, %r4; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r8; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> zeroinitializer, i8 %a, i32 0 @@ -929,40 +926,40 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-LABEL: test_select_cc( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<28>; +; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; ; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; -; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8; -; CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8; +; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; +; CHECK-NEXT: bfe.u32 %r6, %r3, 24, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r4, 8, 8; -; CHECK-NEXT: bfe.u32 %r8, %r3, 8, 8; +; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; +; CHECK-NEXT: bfe.u32 %r8, %r3, 16, 8; ; CHECK-NEXT: setp.ne.u32 %p2, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r4, 16, 8; -; CHECK-NEXT: bfe.u32 %r10, %r3, 16, 8; +; CHECK-NEXT: bfe.u32 %r9, %r4, 8, 8; +; CHECK-NEXT: bfe.u32 %r10, %r3, 8, 8; ; CHECK-NEXT: setp.ne.u32 %p3, %r10, %r9; -; CHECK-NEXT: bfe.u32 %r11, %r4, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r3, 24, 8; +; CHECK-NEXT: bfe.u32 %r11, %r4, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r3, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p4, %r12, %r11; -; CHECK-NEXT: bfe.u32 
%r13, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r13, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r14, %r1, 0, 8; ; CHECK-NEXT: selp.b32 %r15, %r14, %r13, %p4; -; CHECK-NEXT: bfe.u32 %r16, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r17, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r16, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r17, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r18, %r17, %r16, %p3; -; CHECK-NEXT: prmt.b32 %r19, %r18, %r15, 16435; -; CHECK-NEXT: bfe.u32 %r20, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r21, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r19, %r18, %r15, 8, 8; +; CHECK-NEXT: bfe.u32 %r20, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r21, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r22, %r21, %r20, %p2; -; CHECK-NEXT: bfe.u32 %r23, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r24, %r1, 0, 8; -; CHECK-NEXT: selp.b32 %r25, %r24, %r23, %p1; -; CHECK-NEXT: prmt.b32 %r26, %r25, %r22, 13120; -; CHECK-NEXT: prmt.b32 %r27, %r26, %r19, 21520; +; CHECK-NEXT: bfi.b32 %r23, %r22, %r19, 16, 8; +; CHECK-NEXT: bfe.u32 %r24, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r25, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r26, %r25, %r24, %p1; +; CHECK-NEXT: bfi.b32 %r27, %r26, %r23, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r27; ; CHECK-NEXT: ret; %cc = icmp ne <4 x i8> %c, %d @@ -1009,32 +1006,32 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-LABEL: test_select_cc_i8_i32( ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<27>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; ; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; ; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1]; ; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0]; -; CHECK-NEXT: setp.ne.s32 %p1, %r3, %r7; -; CHECK-NEXT: setp.ne.s32 %p2, %r4, %r8; -; CHECK-NEXT: setp.ne.s32 %p3, %r5, %r9; -; CHECK-NEXT: setp.ne.s32 %p4, %r6, %r10; -; 
CHECK-NEXT: bfe.u32 %r11, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r12, %r1, 24, 8; +; CHECK-NEXT: setp.ne.s32 %p1, %r6, %r10; +; CHECK-NEXT: setp.ne.s32 %p2, %r5, %r9; +; CHECK-NEXT: setp.ne.s32 %p3, %r4, %r8; +; CHECK-NEXT: setp.ne.s32 %p4, %r3, %r7; +; CHECK-NEXT: bfe.u32 %r11, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r12, %r1, 0, 8; ; CHECK-NEXT: selp.b32 %r13, %r12, %r11, %p4; -; CHECK-NEXT: bfe.u32 %r14, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r15, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r14, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r15, %r1, 8, 8; ; CHECK-NEXT: selp.b32 %r16, %r15, %r14, %p3; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 16435; -; CHECK-NEXT: bfe.u32 %r18, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 8, 8; +; CHECK-NEXT: bfe.u32 %r18, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r19, %r1, 16, 8; ; CHECK-NEXT: selp.b32 %r20, %r19, %r18, %p2; -; CHECK-NEXT: bfe.u32 %r21, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r22, %r1, 0, 8; -; CHECK-NEXT: selp.b32 %r23, %r22, %r21, %p1; -; CHECK-NEXT: prmt.b32 %r24, %r23, %r20, 13120; -; CHECK-NEXT: prmt.b32 %r25, %r24, %r17, 21520; +; CHECK-NEXT: bfi.b32 %r21, %r20, %r17, 16, 8; +; CHECK-NEXT: bfe.u32 %r22, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r23, %r1, 24, 8; +; CHECK-NEXT: selp.b32 %r24, %r23, %r22, %p1; +; CHECK-NEXT: bfi.b32 %r25, %r24, %r21, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r25; ; CHECK-NEXT: ret; <4 x i32> %c, <4 x i32> %d) #0 { @@ -1047,13 +1044,13 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { ; CHECK-LABEL: test_trunc_2xi32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; -; CHECK-NEXT: prmt.b32 %r5, %r3, %r4, 16435; -; CHECK-NEXT: prmt.b32 %r6, %r1, %r2, 13120; -; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 21520; +; CHECK-NEXT: bfi.b32 %r5, %r2, %r1, 8, 8; +; 
CHECK-NEXT: bfi.b32 %r6, %r3, %r5, 16, 8; +; CHECK-NEXT: bfi.b32 %r7, %r4, %r6, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i32> %a to <4 x i8> @@ -1063,19 +1060,19 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { ; CHECK-LABEL: test_trunc_2xi64( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; ; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; -; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; -; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; -; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 16435; -; CHECK-NEXT: cvt.u32.u64 %r4, %rd2; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd1; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r4, 13120; -; CHECK-NEXT: prmt.b32 %r7, %r6, %r3, 21520; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd1; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd2; +; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; +; CHECK-NEXT: cvt.u32.u64 %r4, %rd3; +; CHECK-NEXT: bfi.b32 %r5, %r4, %r3, 16, 8; +; CHECK-NEXT: cvt.u32.u64 %r6, %rd4; +; CHECK-NEXT: bfi.b32 %r7, %r6, %r5, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r7; ; CHECK-NEXT: ret; %r = trunc <4 x i64> %a to <4 x i8> @@ -1187,16 +1184,15 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-LABEL: test_bitcast_4xi8_to_2xhalf( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<2>; -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; -; CHECK-NEXT: mov.b32 %r1, 6; -; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 16435; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; -; CHECK-NEXT: prmt.b32 %r4, %r3, 5, 13120; -; CHECK-NEXT: prmt.b32 %r5, %r4, %r2, 21520; -; CHECK-NEXT: st.param.b32 [func_retval0+0], %r5; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; +; CHECK-NEXT: bfi.b32 %r2, 5, 
%r1, 8, 8; +; CHECK-NEXT: bfi.b32 %r3, 6, %r2, 16, 8; +; CHECK-NEXT: bfi.b32 %r4, 7, %r3, 24, 8; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r4; ; CHECK-NEXT: ret; %ins.0 = insertelement <4 x i8> undef, i8 %a, i32 0 %ins.1 = insertelement <4 x i8> %ins.0, i8 5, i32 1 @@ -1259,27 +1255,27 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptosi_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptosi_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptosi <4 x half> %a to <4 x i8> @@ -1290,27 +1286,27 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-LABEL: test_fptoui_4xhalf_to_4xi8( ; CHECK: { ; CHECK-NEXT: .reg 
.b16 %rs<13>; -; CHECK-NEXT: .reg .b32 %r<14>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_fptoui_4xhalf_to_4xi8_param_0]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r3; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; ; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r5; -; CHECK-NEXT: cvt.u32.u16 %r6, %rs6; -; CHECK-NEXT: cvt.u32.u16 %r7, %rs5; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; -; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r3; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs5; +; CHECK-NEXT: cvt.u32.u16 %r7, %rs6; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; +; CHECK-NEXT: mov.b32 {%rs7, %rs8}, %r4; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs9, %rs8; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs10, %rs7; ; CHECK-NEXT: mov.b32 %r9, {%rs10, %rs9}; ; CHECK-NEXT: mov.b32 {%rs11, %rs12}, %r9; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs12; -; CHECK-NEXT: cvt.u32.u16 %r11, %rs11; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r10, 13120; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r8, 21520; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs11; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r8, 16, 8; +; CHECK-NEXT: cvt.u32.u16 %r12, %rs12; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r13; ; CHECK-NEXT: ret; %r = fptoui <4 x half> %a to <4 x i8> @@ -1330,33 +1326,33 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; -; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; +; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs2, %r4; ; CHECK-NEXT: rem.s16 %rs3, %rs2, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r5, %rs3; -; CHECK-NEXT: bfe.s32 %r6, %r2, 16, 8; +; CHECK-NEXT: bfe.s32 %r6, %r2, 8, 8; ; 
CHECK-NEXT: cvt.s8.s32 %rs4, %r6; -; CHECK-NEXT: bfe.s32 %r7, %r1, 16, 8; +; CHECK-NEXT: bfe.s32 %r7, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs5, %r7; ; CHECK-NEXT: rem.s16 %rs6, %rs5, %rs4; ; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: prmt.b32 %r9, %r8, %r5, 16435; -; CHECK-NEXT: bfe.s32 %r10, %r2, 8, 8; +; CHECK-NEXT: bfi.b32 %r9, %r8, %r5, 8, 8; +; CHECK-NEXT: bfe.s32 %r10, %r2, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs7, %r10; -; CHECK-NEXT: bfe.s32 %r11, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r11, %r1, 16, 8; ; CHECK-NEXT: cvt.s8.s32 %rs8, %r11; ; CHECK-NEXT: rem.s16 %rs9, %rs8, %rs7; ; CHECK-NEXT: cvt.u32.u16 %r12, %rs9; -; CHECK-NEXT: bfe.s32 %r13, %r2, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs10, %r13; -; CHECK-NEXT: bfe.s32 %r14, %r1, 0, 8; -; CHECK-NEXT: cvt.s8.s32 %rs11, %r14; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r9, 16, 8; +; CHECK-NEXT: bfe.s32 %r14, %r2, 24, 8; +; CHECK-NEXT: cvt.s8.s32 %rs10, %r14; +; CHECK-NEXT: bfe.s32 %r15, %r1, 24, 8; +; CHECK-NEXT: cvt.s8.s32 %rs11, %r15; ; CHECK-NEXT: rem.s16 %rs12, %rs11, %rs10; -; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 21520; +; CHECK-NEXT: cvt.u32.u16 %r16, %rs12; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r13, 24, 8; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: @@ -1377,7 +1373,7 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-LABEL: test_srem_v3i8( ; CHECK: { ; CHECK-NEXT: .reg .b16 %rs<20>; -; CHECK-NEXT: .reg .b32 %r<17>; +; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry @@ -1396,25 +1392,25 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs9; ; CHECK-NEXT: ld.s8 %rs10, [%rd2+2]; -; CHECK-NEXT: bfe.s32 %r5, %r3, 8, 8; +; CHECK-NEXT: bfe.s32 %r5, %r3, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs11, %r5; -; CHECK-NEXT: bfe.s32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.s32 %r6, %r1, 
0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs12, %r6; ; CHECK-NEXT: rem.s16 %rs13, %rs12, %rs11; ; CHECK-NEXT: cvt.u32.u16 %r7, %rs13; -; CHECK-NEXT: bfe.s32 %r8, %r3, 0, 8; +; CHECK-NEXT: bfe.s32 %r8, %r3, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs14, %r8; -; CHECK-NEXT: bfe.s32 %r9, %r1, 0, 8; +; CHECK-NEXT: bfe.s32 %r9, %r1, 8, 8; ; CHECK-NEXT: cvt.s8.s32 %rs15, %r9; ; CHECK-NEXT: rem.s16 %rs16, %rs15, %rs14; ; CHECK-NEXT: cvt.u32.u16 %r10, %rs16; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r7, 13120; +; CHECK-NEXT: bfi.b32 %r11, %r10, %r7, 8, 8; ; CHECK-NEXT: // implicit-def: %r13 -; CHECK-NEXT: // implicit-def: %r14 -; CHECK-NEXT: prmt.b32 %r12, %r13, %r14, 16435; -; CHECK-NEXT: prmt.b32 %r15, %r11, %r12, 21520; +; CHECK-NEXT: bfi.b32 %r12, %r13, %r11, 16, 8; +; CHECK-NEXT: // implicit-def: %r15 +; CHECK-NEXT: bfi.b32 %r14, %r15, %r12, 24, 8; ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; -; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs18, tmp}, %r15; } +; CHECK-NEXT: cvt.u16.u32 %rs18, %r14; ; CHECK-NEXT: st.u8 [%rd3], %rs18; ; CHECK-NEXT: shr.u16 %rs19, %rs18, 8; ; CHECK-NEXT: st.u8 [%rd3+1], %rs19; @@ -1441,25 +1437,25 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; ; CHECK-NEXT: ld.u32 %r2, [%rd2]; -; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; -; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; -; CHECK-NEXT: bfe.u32 %r5, %r2, 8, 8; -; CHECK-NEXT: bfe.u32 %r6, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: bfe.u32 %r6, %r1, 16, 8; ; CHECK-NEXT: setp.hi.u32 %p2, %r6, %r5; -; CHECK-NEXT: bfe.u32 %r7, %r2, 16, 8; -; CHECK-NEXT: bfe.u32 %r8, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r7, %r2, 8, 8; +; CHECK-NEXT: bfe.u32 %r8, %r1, 8, 8; ; CHECK-NEXT: setp.hi.u32 %p3, %r8, %r7; -; CHECK-NEXT: bfe.u32 %r9, %r2, 24, 8; -; CHECK-NEXT: bfe.u32 %r10, %r1, 24, 8; +; 
CHECK-NEXT: bfe.u32 %r9, %r2, 0, 8; +; CHECK-NEXT: bfe.u32 %r10, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p4, %r10, %r9; ; CHECK-NEXT: selp.s32 %r11, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r12, -1, 0, %p3; -; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 16435; +; CHECK-NEXT: bfi.b32 %r13, %r12, %r11, 8, 8; ; CHECK-NEXT: selp.s32 %r14, -1, 0, %p2; -; CHECK-NEXT: selp.s32 %r15, -1, 0, %p1; -; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 13120; -; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 21520; +; CHECK-NEXT: bfi.b32 %r15, %r14, %r13, 16, 8; +; CHECK-NEXT: selp.s32 %r16, -1, 0, %p1; +; CHECK-NEXT: bfi.b32 %r17, %r16, %r15, 24, 8; ; CHECK-NEXT: st.u32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index 8b7e5235443f..f471d47077cf 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -33,35 +33,35 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<5>; ; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; ; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: and.b16 %rs2, %rs1, 255; ; CHECK-NEXT: setp.eq.s16 %p1, %rs2, 0; -; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8; +; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs3, %r3; ; CHECK-NEXT: and.b16 %rs4, %rs3, 255; ; CHECK-NEXT: setp.eq.s16 %p2, %rs4, 0; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; ; CHECK-NEXT: cvt.u16.u32 %rs5, %r4; ; CHECK-NEXT: and.b16 %rs6, %rs5, 255; ; CHECK-NEXT: setp.eq.s16 %p3, %rs6, 0; -; CHECK-NEXT: bfe.u32 %r5, %r1, 24, 8; +; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs7, %r5; ; 
CHECK-NEXT: and.b16 %rs8, %rs7, 255; ; CHECK-NEXT: setp.eq.s16 %p4, %rs8, 0; ; CHECK-NEXT: selp.s32 %r6, -1, 0, %p4; ; CHECK-NEXT: selp.s32 %r7, -1, 0, %p3; -; CHECK-NEXT: prmt.b32 %r8, %r7, %r6, 16435; +; CHECK-NEXT: bfi.b32 %r8, %r7, %r6, 8, 8; ; CHECK-NEXT: selp.s32 %r9, -1, 0, %p2; -; CHECK-NEXT: selp.s32 %r10, -1, 0, %p1; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r9, 13120; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r8, 21520; +; CHECK-NEXT: bfi.b32 %r10, %r9, %r8, 16, 8; +; CHECK-NEXT: selp.s32 %r11, -1, 0, %p1; +; CHECK-NEXT: bfi.b32 %r12, %r11, %r10, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0+0], %r12; ; CHECK-NEXT: ret; entry: -- GitLab From c8da2253f9aa4dff039e9ed766ff0f865632a0eb Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 11 Oct 2024 05:45:09 -0700 Subject: [PATCH 032/345] [Clang] Replace Intrinsic::getDeclaration with getOrInsertDeclaration (#111990) Fix build failure from the rename change. Looks like one additional reference sneaked in between pre-commit checks and the commit itself. 
--- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 059c75fae284..465afd04740d 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18882,7 +18882,7 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { // for the DirectX intrinsic and the demangled builtin name switch (CGM.getTarget().getTriple().getArch()) { case llvm::Triple::dxil: - return EmitRuntimeCall(Intrinsic::getDeclaration( + return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( &CGM.getModule(), Intrinsic::dx_wave_getlaneindex)); case llvm::Triple::spirv: return EmitRuntimeCall(CGM.CreateRuntimeFunction( -- GitLab From ed7251b3aeb7c471dc50e9409e83a9ec01f40df5 Mon Sep 17 00:00:00 2001 From: Mikhail Goncharov Date: Fri, 11 Oct 2024 14:46:46 +0200 Subject: [PATCH 033/345] Revert "[clang] Implement TTP P0522 pack matching for deduced function template calls. (#111457)" See discussion in https://github.com/llvm/llvm-project/pull/111711 This reverts commit 4dadf42c1a74dd4e37db9ffd6fbb3027f59751a7. --- clang/include/clang/Sema/Overload.h | 10 +-- clang/include/clang/Sema/Sema.h | 23 +++---- clang/lib/Sema/SemaLookup.cpp | 1 - clang/lib/Sema/SemaOverload.cpp | 50 ++++++--------- clang/lib/Sema/SemaTemplate.cpp | 23 ++++--- clang/lib/Sema/SemaTemplateDeduction.cpp | 70 +++++++++----------- clang/test/SemaTemplate/cwg2398.cpp | 81 ------------------------ 7 files changed, 69 insertions(+), 189 deletions(-) diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index d38278c50411..c716a25bb673 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -925,11 +925,6 @@ class Sema; bool TookAddressOfOverload : 1; - /// Have we matched any packs on the parameter side, versus any non-packs on - /// the argument side, in a context where the opposite matching is also - /// allowed? 
- bool HasMatchedPackOnParmToNonPackOnArg : 1; - /// True if the candidate was found using ADL. CallExpr::ADLCallKind IsADLCandidate : 1; @@ -1004,9 +999,8 @@ class Sema; friend class OverloadCandidateSet; OverloadCandidate() : IsSurrogate(false), IgnoreObjectArgument(false), - TookAddressOfOverload(false), - HasMatchedPackOnParmToNonPackOnArg(false), - IsADLCandidate(CallExpr::NotADL), RewriteKind(CRK_None) {} + TookAddressOfOverload(false), IsADLCandidate(CallExpr::NotADL), + RewriteKind(CRK_None) {} }; /// OverloadCandidateSet - A set of overload candidates, used in C++ diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index f8118ca64ad3..66b0846f286a 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -10134,8 +10134,7 @@ public: ADLCallKind IsADLCandidate = ADLCallKind::NotADL, ConversionSequenceList EarlyConversions = std::nullopt, OverloadCandidateParamOrder PO = {}, - bool AggregateCandidateDeduction = false, - bool HasMatchedPackOnParmToNonPackOnArg = false); + bool AggregateCandidateDeduction = false); /// Add all of the function declarations in the given function set to /// the overload candidate set. 
@@ -10170,8 +10169,7 @@ public: bool SuppressUserConversions = false, bool PartialOverloading = false, ConversionSequenceList EarlyConversions = std::nullopt, - OverloadCandidateParamOrder PO = {}, - bool HasMatchedPackOnParmToNonPackOnArg = false); + OverloadCandidateParamOrder PO = {}); /// Add a C++ member function template as a candidate to the candidate /// set, using template argument deduction to produce an appropriate member @@ -10217,8 +10215,7 @@ public: CXXConversionDecl *Conversion, DeclAccessPair FoundDecl, CXXRecordDecl *ActingContext, Expr *From, QualType ToType, OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, - bool AllowExplicit, bool AllowResultConversion = true, - bool HasMatchedPackOnParmToNonPackOnArg = false); + bool AllowExplicit, bool AllowResultConversion = true); /// Adds a conversion function template specialization /// candidate to the overload set, using template argument deduction @@ -11641,7 +11638,7 @@ public: SourceLocation RAngleLoc, unsigned ArgumentPackIndex, SmallVectorImpl &SugaredConverted, SmallVectorImpl &CanonicalConverted, - CheckTemplateArgumentKind CTAK, bool PartialOrdering, + CheckTemplateArgumentKind CTAK, bool *MatchedPackOnParmToNonPackOnArg); /// Check that the given template arguments can be provided to @@ -11724,8 +11721,7 @@ public: /// It returns true if an error occurred, and false otherwise. 
bool CheckTemplateTemplateArgument(TemplateTemplateParmDecl *Param, TemplateParameterList *Params, - TemplateArgumentLoc &Arg, - bool PartialOrdering, + TemplateArgumentLoc &Arg, bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg); void NoteTemplateLocation(const NamedDecl &Decl, @@ -12237,8 +12233,8 @@ public: SmallVectorImpl &Deduced, unsigned NumExplicitlySpecified, FunctionDecl *&Specialization, sema::TemplateDeductionInfo &Info, - SmallVectorImpl const *OriginalCallArgs, - bool PartialOverloading, bool PartialOrdering, + SmallVectorImpl const *OriginalCallArgs = nullptr, + bool PartialOverloading = false, llvm::function_ref CheckNonDependent = [] { return false; }); /// Perform template argument deduction from a function call @@ -12272,8 +12268,7 @@ public: TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, FunctionDecl *&Specialization, sema::TemplateDeductionInfo &Info, bool PartialOverloading, bool AggregateDeductionCandidate, - bool PartialOrdering, QualType ObjectType, - Expr::Classification ObjectClassification, + QualType ObjectType, Expr::Classification ObjectClassification, llvm::function_ref)> CheckNonDependent); /// Deduce template arguments when taking the address of a function @@ -12428,7 +12423,7 @@ public: bool isTemplateTemplateParameterAtLeastAsSpecializedAs( TemplateParameterList *PParam, TemplateDecl *PArg, TemplateDecl *AArg, const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, - bool PartialOrdering, bool *MatchedPackOnParmToNonPackOnArg); + bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg); /// Mark which template parameters are used in a given expression. 
/// diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 60fa195221c9..31422c213ac2 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -3667,7 +3667,6 @@ Sema::LookupLiteralOperator(Scope *S, LookupResult &R, if (CheckTemplateArgument( Params->getParam(0), Arg, FD, R.getNameLoc(), R.getNameLoc(), 0, SugaredChecked, CanonicalChecked, CTAK_Specified, - /*PartialOrdering=*/false, /*MatchedPackOnParmToNonPackOnArg=*/nullptr) || Trap.hasErrorOccurred()) IsTemplate = false; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index f545e9341e1a..2cde8131108f 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -6864,8 +6864,7 @@ void Sema::AddOverloadCandidate( OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, bool PartialOverloading, bool AllowExplicit, bool AllowExplicitConversions, ADLCallKind IsADLCandidate, ConversionSequenceList EarlyConversions, - OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction, - bool HasMatchedPackOnParmToNonPackOnArg) { + OverloadCandidateParamOrder PO, bool AggregateCandidateDeduction) { const FunctionProtoType *Proto = dyn_cast(Function->getType()->getAs()); assert(Proto && "Functions without a prototype cannot be overloaded"); @@ -6884,8 +6883,7 @@ void Sema::AddOverloadCandidate( AddMethodCandidate(Method, FoundDecl, Method->getParent(), QualType(), Expr::Classification::makeSimpleLValue(), Args, CandidateSet, SuppressUserConversions, - PartialOverloading, EarlyConversions, PO, - HasMatchedPackOnParmToNonPackOnArg); + PartialOverloading, EarlyConversions, PO); return; } // We treat a constructor like a non-member function, since its object @@ -6928,8 +6926,6 @@ void Sema::AddOverloadCandidate( CandidateSet.getRewriteInfo().getRewriteKind(Function, PO); Candidate.IsADLCandidate = IsADLCandidate; Candidate.ExplicitCallArguments = Args.size(); - Candidate.HasMatchedPackOnParmToNonPackOnArg = - 
HasMatchedPackOnParmToNonPackOnArg; // Explicit functions are not actually candidates at all if we're not // allowing them in this context, but keep them around so we can point @@ -7457,13 +7453,16 @@ void Sema::AddMethodCandidate(DeclAccessPair FoundDecl, QualType ObjectType, } } -void Sema::AddMethodCandidate( - CXXMethodDecl *Method, DeclAccessPair FoundDecl, - CXXRecordDecl *ActingContext, QualType ObjectType, - Expr::Classification ObjectClassification, ArrayRef Args, - OverloadCandidateSet &CandidateSet, bool SuppressUserConversions, - bool PartialOverloading, ConversionSequenceList EarlyConversions, - OverloadCandidateParamOrder PO, bool HasMatchedPackOnParmToNonPackOnArg) { +void +Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl, + CXXRecordDecl *ActingContext, QualType ObjectType, + Expr::Classification ObjectClassification, + ArrayRef Args, + OverloadCandidateSet &CandidateSet, + bool SuppressUserConversions, + bool PartialOverloading, + ConversionSequenceList EarlyConversions, + OverloadCandidateParamOrder PO) { const FunctionProtoType *Proto = dyn_cast(Method->getType()->getAs()); assert(Proto && "Methods without a prototype cannot be overloaded"); @@ -7494,8 +7493,6 @@ void Sema::AddMethodCandidate( Candidate.TookAddressOfOverload = CandidateSet.getKind() == OverloadCandidateSet::CSK_AddressOfOverloadSet; Candidate.ExplicitCallArguments = Args.size(); - Candidate.HasMatchedPackOnParmToNonPackOnArg = - HasMatchedPackOnParmToNonPackOnArg; bool IgnoreExplicitObject = (Method->isExplicitObjectMemberFunction() && @@ -7666,8 +7663,8 @@ void Sema::AddMethodTemplateCandidate( ConversionSequenceList Conversions; if (TemplateDeductionResult Result = DeduceTemplateArguments( MethodTmpl, ExplicitTemplateArgs, Args, Specialization, Info, - PartialOverloading, /*AggregateDeductionCandidate=*/false, - /*PartialOrdering=*/false, ObjectType, ObjectClassification, + PartialOverloading, /*AggregateDeductionCandidate=*/false, ObjectType, + 
ObjectClassification, [&](ArrayRef ParamTypes) { return CheckNonDependentConversions( MethodTmpl, ParamTypes, Args, CandidateSet, Conversions, @@ -7705,8 +7702,7 @@ void Sema::AddMethodTemplateCandidate( AddMethodCandidate(cast(Specialization), FoundDecl, ActingContext, ObjectType, ObjectClassification, Args, CandidateSet, SuppressUserConversions, PartialOverloading, - Conversions, PO, - Info.hasMatchedPackOnParmToNonPackOnArg()); + Conversions, PO); } /// Determine whether a given function template has a simple explicit specifier @@ -7752,7 +7748,6 @@ void Sema::AddTemplateOverloadCandidate( if (TemplateDeductionResult Result = DeduceTemplateArguments( FunctionTemplate, ExplicitTemplateArgs, Args, Specialization, Info, PartialOverloading, AggregateCandidateDeduction, - /*PartialOrdering=*/false, /*ObjectType=*/QualType(), /*ObjectClassification=*/Expr::Classification(), [&](ArrayRef ParamTypes) { @@ -7793,8 +7788,7 @@ void Sema::AddTemplateOverloadCandidate( Specialization, FoundDecl, Args, CandidateSet, SuppressUserConversions, PartialOverloading, AllowExplicit, /*AllowExplicitConversions=*/false, IsADLCandidate, Conversions, PO, - Info.AggregateDeductionCandidateHasMismatchedArity, - Info.hasMatchedPackOnParmToNonPackOnArg()); + Info.AggregateDeductionCandidateHasMismatchedArity); } bool Sema::CheckNonDependentConversions( @@ -7916,8 +7910,7 @@ void Sema::AddConversionCandidate( CXXConversionDecl *Conversion, DeclAccessPair FoundDecl, CXXRecordDecl *ActingContext, Expr *From, QualType ToType, OverloadCandidateSet &CandidateSet, bool AllowObjCConversionOnExplicit, - bool AllowExplicit, bool AllowResultConversion, - bool HasMatchedPackOnParmToNonPackOnArg) { + bool AllowExplicit, bool AllowResultConversion) { assert(!Conversion->getDescribedFunctionTemplate() && "Conversion function templates use AddTemplateConversionCandidate"); QualType ConvType = Conversion->getConversionType().getNonReferenceType(); @@ -7962,8 +7955,6 @@ void Sema::AddConversionCandidate( 
Candidate.FinalConversion.setAllToTypes(ToType); Candidate.Viable = true; Candidate.ExplicitCallArguments = 1; - Candidate.HasMatchedPackOnParmToNonPackOnArg = - HasMatchedPackOnParmToNonPackOnArg; // Explicit functions are not actually candidates at all if we're not // allowing them in this context, but keep them around so we can point @@ -8165,8 +8156,7 @@ void Sema::AddTemplateConversionCandidate( assert(Specialization && "Missing function template specialization?"); AddConversionCandidate(Specialization, FoundDecl, ActingDC, From, ToType, CandidateSet, AllowObjCConversionOnExplicit, - AllowExplicit, AllowResultConversion, - Info.hasMatchedPackOnParmToNonPackOnArg()); + AllowExplicit, AllowResultConversion); } void Sema::AddSurrogateCandidate(CXXConversionDecl *Conversion, @@ -10519,10 +10509,6 @@ bool clang::isBetterOverloadCandidate( isa(Cand2.Function)) return isa(Cand1.Function); - if (Cand1.HasMatchedPackOnParmToNonPackOnArg != - Cand2.HasMatchedPackOnParmToNonPackOnArg) - return Cand2.HasMatchedPackOnParmToNonPackOnArg; - // -- F1 is a non-template function and F2 is a function template // specialization, or, if not that, bool Cand1IsSpecialization = Cand1.Function && diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 62d0d0914fa3..4f13669c2490 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -5179,8 +5179,7 @@ bool Sema::CheckTemplateArgument( unsigned ArgumentPackIndex, SmallVectorImpl &SugaredConverted, SmallVectorImpl &CanonicalConverted, - CheckTemplateArgumentKind CTAK, bool PartialOrdering, - bool *MatchedPackOnParmToNonPackOnArg) { + CheckTemplateArgumentKind CTAK, bool *MatchedPackOnParmToNonPackOnArg) { // Check template type parameters. 
if (TemplateTypeParmDecl *TTP = dyn_cast(Param)) return CheckTemplateTypeArgument(TTP, Arg, SugaredConverted, @@ -5395,7 +5394,8 @@ bool Sema::CheckTemplateArgument( case TemplateArgument::Template: case TemplateArgument::TemplateExpansion: - if (CheckTemplateTemplateArgument(TempParm, Params, Arg, PartialOrdering, + if (CheckTemplateTemplateArgument(TempParm, Params, Arg, + /*IsDeduced=*/CTAK != CTAK_Specified, MatchedPackOnParmToNonPackOnArg)) return true; @@ -5546,11 +5546,10 @@ bool Sema::CheckTemplateArgumentList( if (ArgIdx < NumArgs) { // Check the template argument we were given. - if (CheckTemplateArgument(*Param, NewArgs[ArgIdx], Template, TemplateLoc, - RAngleLoc, SugaredArgumentPack.size(), - SugaredConverted, CanonicalConverted, - CTAK_Specified, /*PartialOrdering=*/false, - MatchedPackOnParmToNonPackOnArg)) + if (CheckTemplateArgument( + *Param, NewArgs[ArgIdx], Template, TemplateLoc, RAngleLoc, + SugaredArgumentPack.size(), SugaredConverted, CanonicalConverted, + CTAK_Specified, MatchedPackOnParmToNonPackOnArg)) return true; CanonicalConverted.back().setIsDefaulted( @@ -5708,7 +5707,7 @@ bool Sema::CheckTemplateArgumentList( // Check the default template argument. 
if (CheckTemplateArgument(*Param, Arg, Template, TemplateLoc, RAngleLoc, 0, SugaredConverted, CanonicalConverted, - CTAK_Specified, /*PartialOrdering=*/false, + CTAK_Specified, /*MatchedPackOnParmToNonPackOnArg=*/nullptr)) return true; @@ -7294,7 +7293,7 @@ static void DiagnoseTemplateParameterListArityMismatch( bool Sema::CheckTemplateTemplateArgument( TemplateTemplateParmDecl *Param, TemplateParameterList *Params, - TemplateArgumentLoc &Arg, bool PartialOrdering, + TemplateArgumentLoc &Arg, bool IsDeduced, bool *MatchedPackOnParmToNonPackOnArg) { TemplateName Name = Arg.getArgument().getAsTemplateOrTemplatePattern(); auto [Template, DefaultArgs] = Name.getTemplateDeclAndDefaultArgs(); @@ -7339,8 +7338,8 @@ bool Sema::CheckTemplateTemplateArgument( // A template-argument matches a template template-parameter P when P // is at least as specialized as the template-argument A. if (!isTemplateTemplateParameterAtLeastAsSpecializedAs( - Params, Param, Template, DefaultArgs, Arg.getLocation(), - PartialOrdering, MatchedPackOnParmToNonPackOnArg)) + Params, Param, Template, DefaultArgs, Arg.getLocation(), IsDeduced, + MatchedPackOnParmToNonPackOnArg)) return true; // P2113 // C++20[temp.func.order]p2 diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index e49d315f7186..48a39a90f72a 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -2955,7 +2955,7 @@ Sema::getIdentityTemplateArgumentLoc(NamedDecl *TemplateParm, /// fully-converted template arguments. static bool ConvertDeducedTemplateArgument( Sema &S, NamedDecl *Param, DeducedTemplateArgument Arg, NamedDecl *Template, - TemplateDeductionInfo &Info, bool IsDeduced, bool PartialOrdering, + TemplateDeductionInfo &Info, bool IsDeduced, SmallVectorImpl &SugaredOutput, SmallVectorImpl &CanonicalOutput) { auto ConvertArg = [&](DeducedTemplateArgument Arg, @@ -2976,7 +2976,7 @@ static bool ConvertDeducedTemplateArgument( ? 
(Arg.wasDeducedFromArrayBound() ? Sema::CTAK_DeducedFromArrayBound : Sema::CTAK_Deduced) : Sema::CTAK_Specified, - PartialOrdering, &MatchedPackOnParmToNonPackOnArg); + &MatchedPackOnParmToNonPackOnArg); if (MatchedPackOnParmToNonPackOnArg) Info.setMatchedPackOnParmToNonPackOnArg(); return Res; @@ -3062,9 +3062,9 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( SmallVectorImpl &Deduced, TemplateDeductionInfo &Info, SmallVectorImpl &SugaredBuilder, - SmallVectorImpl &CanonicalBuilder, bool PartialOrdering, - LocalInstantiationScope *CurrentInstantiationScope, - unsigned NumAlreadyConverted, bool *IsIncomplete) { + SmallVectorImpl &CanonicalBuilder, + LocalInstantiationScope *CurrentInstantiationScope = nullptr, + unsigned NumAlreadyConverted = 0, bool *IsIncomplete = nullptr) { TemplateParameterList *TemplateParams = Template->getTemplateParameters(); for (unsigned I = 0, N = TemplateParams->size(); I != N; ++I) { @@ -3107,8 +3107,8 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // We may have deduced this argument, so it still needs to be // checked and converted. if (ConvertDeducedTemplateArgument(S, Param, Deduced[I], Template, Info, - IsDeduced, PartialOrdering, - SugaredBuilder, CanonicalBuilder)) { + IsDeduced, SugaredBuilder, + CanonicalBuilder)) { Info.Param = makeTemplateParameter(Param); // FIXME: These template arguments are temporary. Free them! Info.reset( @@ -3174,8 +3174,7 @@ static TemplateDeductionResult ConvertDeducedTemplateArguments( // Check whether we can actually use the default argument. 
if (S.CheckTemplateArgument( Param, DefArg, TD, TD->getLocation(), TD->getSourceRange().getEnd(), - /*ArgumentPackIndex=*/0, SugaredBuilder, CanonicalBuilder, - Sema::CTAK_Specified, /*PartialOrdering=*/false, + 0, SugaredBuilder, CanonicalBuilder, Sema::CTAK_Specified, /*MatchedPackOnParmToNonPackOnArg=*/nullptr)) { Info.Param = makeTemplateParameter( const_cast(TemplateParams->getParam(I))); @@ -3284,9 +3283,7 @@ FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, Partial, IsPartialOrdering, Deduced, Info, SugaredBuilder, - CanonicalBuilder, IsPartialOrdering, - /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, - /*IsIncomplete=*/nullptr); + CanonicalBuilder); Result != TemplateDeductionResult::Success) return Result; @@ -3386,10 +3383,10 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( // explicitly specified, template argument deduction fails. SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( - S, Template, /*IsDeduced=*/PartialOrdering, Deduced, Info, - SugaredBuilder, CanonicalBuilder, PartialOrdering, + S, Template, /*IsDeduced*/ PartialOrdering, Deduced, Info, + SugaredBuilder, CanonicalBuilder, /*CurrentInstantiationScope=*/nullptr, - /*NumAlreadyConverted=*/0U, /*IsIncomplete=*/nullptr); + /*NumAlreadyConverted=*/0U); Result != TemplateDeductionResult::Success) return Result; @@ -3454,9 +3451,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, TD, /*IsDeduced=*/false, Deduced, Info, SugaredBuilder, - CanonicalBuilder, /*PartialOrdering=*/false, - /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, - /*IsIncomplete=*/nullptr); + CanonicalBuilder); Result != TemplateDeductionResult::Success) return Result; @@ -3994,8 +3989,7 @@ TemplateDeductionResult 
Sema::FinishTemplateArgumentDeduction( unsigned NumExplicitlySpecified, FunctionDecl *&Specialization, TemplateDeductionInfo &Info, SmallVectorImpl const *OriginalCallArgs, - bool PartialOverloading, bool PartialOrdering, - llvm::function_ref CheckNonDependent) { + bool PartialOverloading, llvm::function_ref CheckNonDependent) { // Unevaluated SFINAE context. EnterExpressionEvaluationContext Unevaluated( *this, Sema::ExpressionEvaluationContext::Unevaluated); @@ -4018,10 +4012,9 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( bool IsIncomplete = false; SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( - *this, FunctionTemplate, /*IsDeduced=*/true, Deduced, Info, - SugaredBuilder, CanonicalBuilder, PartialOrdering, - CurrentInstantiationScope, NumExplicitlySpecified, - PartialOverloading ? &IsIncomplete : nullptr); + *this, FunctionTemplate, /*IsDeduced*/ true, Deduced, Info, + SugaredBuilder, CanonicalBuilder, CurrentInstantiationScope, + NumExplicitlySpecified, PartialOverloading ? 
&IsIncomplete : nullptr); Result != TemplateDeductionResult::Success) return Result; @@ -4553,8 +4546,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( TemplateArgumentListInfo *ExplicitTemplateArgs, ArrayRef Args, FunctionDecl *&Specialization, TemplateDeductionInfo &Info, bool PartialOverloading, bool AggregateDeductionCandidate, - bool PartialOrdering, QualType ObjectType, - Expr::Classification ObjectClassification, + QualType ObjectType, Expr::Classification ObjectClassification, llvm::function_ref)> CheckNonDependent) { if (FunctionTemplate->isInvalidDecl()) return TemplateDeductionResult::Invalid; @@ -4769,8 +4761,7 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( runWithSufficientStackSpace(Info.getLocation(), [&] { Result = FinishTemplateArgumentDeduction( FunctionTemplate, Deduced, NumExplicitlySpecified, Specialization, Info, - &OriginalCallArgs, PartialOverloading, PartialOrdering, - [&, CallingCtx]() { + &OriginalCallArgs, PartialOverloading, [&, CallingCtx]() { ContextRAII SavedContext(*this, CallingCtx); return CheckNonDependent(ParamTypesForArgChecking); }); @@ -4882,10 +4873,9 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( TemplateDeductionResult Result; runWithSufficientStackSpace(Info.getLocation(), [&] { - Result = FinishTemplateArgumentDeduction( - FunctionTemplate, Deduced, NumExplicitlySpecified, Specialization, Info, - /*OriginalCallArgs=*/nullptr, /*PartialOverloading=*/false, - /*PartialOrdering=*/true); + Result = FinishTemplateArgumentDeduction(FunctionTemplate, Deduced, + NumExplicitlySpecified, + Specialization, Info); }); if (Result != TemplateDeductionResult::Success) return Result; @@ -5065,10 +5055,9 @@ TemplateDeductionResult Sema::DeduceTemplateArguments( FunctionDecl *ConversionSpecialized = nullptr; TemplateDeductionResult Result; runWithSufficientStackSpace(Info.getLocation(), [&] { - Result = FinishTemplateArgumentDeduction( - ConversionTemplate, Deduced, 0, ConversionSpecialized, Info, - 
&OriginalCallArgs, /*PartialOverloading=*/false, - /*PartialOrdering=*/false); + Result = FinishTemplateArgumentDeduction(ConversionTemplate, Deduced, 0, + ConversionSpecialized, Info, + &OriginalCallArgs); }); Specialization = cast_or_null(ConversionSpecialized); return Result; @@ -5645,8 +5634,7 @@ static TemplateDeductionResult FinishTemplateArgumentDeduction( SmallVector SugaredBuilder, CanonicalBuilder; if (auto Result = ConvertDeducedTemplateArguments( S, FTD, /*IsDeduced=*/true, Deduced, Info, SugaredBuilder, - CanonicalBuilder, /*PartialOrdering=*/true, - /*CurrentInstantiationScope=*/nullptr, + CanonicalBuilder, /*CurrentInstantiationScope=*/nullptr, /*NumAlreadyConverted=*/0, &IsIncomplete); Result != TemplateDeductionResult::Success) return Result; @@ -6491,8 +6479,8 @@ bool Sema::isMoreSpecializedThanPrimary( bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs( TemplateParameterList *P, TemplateDecl *PArg, TemplateDecl *AArg, - const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, - bool PartialOrdering, bool *MatchedPackOnParmToNonPackOnArg) { + const DefaultArguments &DefaultArgs, SourceLocation ArgLoc, bool IsDeduced, + bool *MatchedPackOnParmToNonPackOnArg) { // C++1z [temp.arg.template]p4: (DR 150) // A template template-parameter P is at least as specialized as a // template template-argument A if, given the following rewrite to two @@ -6571,7 +6559,7 @@ bool Sema::isTemplateTemplateParameterAtLeastAsSpecializedAs( switch (::DeduceTemplateArguments( *this, A, AArgs, PArgs, Info, Deduced, /*NumberOfArgumentsMustMatch=*/false, /*PartialOrdering=*/true, - PartialOrdering ? PackFold::ArgumentToParameter : PackFold::Both, + IsDeduced ? 
PackFold::ArgumentToParameter : PackFold::Both, /*HasDeducedAnyParam=*/nullptr)) { case clang::TemplateDeductionResult::Success: if (MatchedPackOnParmToNonPackOnArg && diff --git a/clang/test/SemaTemplate/cwg2398.cpp b/clang/test/SemaTemplate/cwg2398.cpp index 3825239de4a2..56091e84cf4e 100644 --- a/clang/test/SemaTemplate/cwg2398.cpp +++ b/clang/test/SemaTemplate/cwg2398.cpp @@ -405,87 +405,6 @@ namespace packs { } // namespace t4 } // namespace packs -namespace fun_tmpl_call { - namespace match_func { - template