Skip to content
  • Nate Begeman's avatar
    ~40% faster vector shl <4 x i32> on SSE 4.1 Larger improvements for smaller... · 269a6da0
    Nate Begeman authored
    ~40% faster vector shl <4 x i32> on SSE 4.1. Larger improvements for smaller types are coming in future patches.
    
    For:
    
    define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
    entry:
      %shl = shl <4 x i32> %r, %a                     ; <<4 x i32>> [#uses=1]
      %tmp2 = bitcast <4 x i32> %shl to <2 x i64>     ; <<2 x i64>> [#uses=1]
      ret <2 x i64> %tmp2
    }
    
    We get:
    
    _shl:                                   ## @shl
    	pslld	$23, %xmm1
    	paddd	LCPI0_0, %xmm1
    	cvttps2dq	%xmm1, %xmm1
    	pmulld	%xmm1, %xmm0
    	ret
    
    Instead of:
    
    _shl:                                   ## @shl
    	pshufd	$3, %xmm0, %xmm2
    	movd	%xmm2, %eax
    	pshufd	$3, %xmm1, %xmm2
    	movd	%xmm2, %ecx
    	shll	%cl, %eax
    	movd	%eax, %xmm2
    	pshufd	$1, %xmm0, %xmm3
    	movd	%xmm3, %eax
    	pshufd	$1, %xmm1, %xmm3
    	movd	%xmm3, %ecx
    	shll	%cl, %eax
    	movd	%eax, %xmm3
    	punpckldq	%xmm2, %xmm3
    	movd	%xmm0, %eax
    	movd	%xmm1, %ecx
    	shll	%cl, %eax
    	movd	%eax, %xmm2
    	movhlps	%xmm0, %xmm0
    	movd	%xmm0, %eax
    	movhlps	%xmm1, %xmm1
    	movd	%xmm1, %ecx
    	shll	%cl, %eax
    	movd	%eax, %xmm0
    	punpckldq	%xmm0, %xmm2
    	movdqa	%xmm2, %xmm0
    	punpckldq	%xmm3, %xmm0
    	ret
    
    llvm-svn: 109549
    269a6da0
Loading