From a3bdd8f27b0991387fd03568379a942b01cddfcd Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 10 Mar 2017 05:25:49 +0000 Subject: [PATCH] AMDGPU: Fix insertion point when reducing load intrinsics The insertion point may be later than the next instruction, so it is necessary to set it when replacing the call. llvm-svn: 297439 --- .../InstCombineSimplifyDemanded.cpp | 3 ++ .../amdgcn-demanded-vector-elts.ll | 38 +++++++++++++++++++ 2 files changed, 41 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 78ee9d5de3fd..843ca7fedfc6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -1584,6 +1584,9 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I) Args.push_back(II->getArgOperand(I)); + IRBuilderBase::InsertPointGuard Guard(*Builder); + Builder->SetInsertPoint(II); + CallInst *NewCall = Builder->CreateCall(NewIntrin, Args); NewCall->takeName(II); NewCall->copyMetadata(*II); diff --git a/llvm/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll index 642f537d1103..888f51bf939d 100644 --- a/llvm/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll @@ -266,6 +266,44 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i3 ret <2 x float> %shuf } +; The initial insertion point is at the extractelement +; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) +; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> +; CHECK-NEXT: %tmp1 = 
bitcast <4 x float> %1 to <2 x double> +; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0 +; CHECK-NEXT: ret double %tmp2 +define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { + %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %tmp1 = bitcast <4 x float> %tmp to <2 x double> + %tmp2 = extractelement <2 x double> %tmp1, i32 0 + ret double %tmp2 +} + +; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) +; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32 +; CHECK-NEXT: ret i32 %tmp2 +define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { + %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %tmp1 = bitcast <4 x float> %tmp to <4 x i32> + %tmp2 = extractelement <4 x i32> %tmp1, i32 0 + ret i32 %tmp2 +} + +; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32( +; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) +; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %tmp, i64 0 +; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <8 x i16> +; CHECK-NEXT: %tmp2 = extractelement <8 x i16> %tmp1, i32 0 +; CHECK-NEXT: ret i16 %tmp2 +define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 { + %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false) #3 + %tmp1 = bitcast <4 x float> %tmp to <8 x i16> + %tmp2 = extractelement <8 x i16> %tmp1, i32 0 + ret i16 %tmp2 +} + declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1 declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1 declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -- 
GitLab