    Generate better code for v8i16 shuffles on SSE2 · e684da3e
    Nate Begeman authored
    Generate better code for v16i8 shuffles on SSE2 (avoids the stack).
    Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it takes fewer uops.
    Document the shuffle matching logic and add some FIXMEs for further cleanups
      later.
    New tests that exercise the above.
    
    Examples:
    
    New:
    _shuf2:
    	pextrw	$7, %xmm0, %eax
    	punpcklqdq	%xmm1, %xmm0
    	pshuflw	$128, %xmm0, %xmm0
    	pinsrw	$2, %eax, %xmm0
    
    Old:
    _shuf2:
    	pextrw	$2, %xmm0, %eax
    	pextrw	$7, %xmm0, %ecx
    	pinsrw	$2, %ecx, %xmm0
    	pinsrw	$3, %eax, %xmm0
    	movd	%xmm1, %eax
    	pinsrw	$4, %eax, %xmm0
    	ret
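
    For reference, the new _shuf2 sequence maps onto SSE2 intrinsics roughly as
    follows. This is a sketch of the emitted instruction pattern, not the test
    source; the function name and signature are assumed for illustration.

    	#include <emmintrin.h>  /* SSE2 */

    	/* Sketch of the new _shuf2 lowering: pextrw / punpcklqdq / pshuflw / pinsrw.
    	   The test's actual inputs and shuffle mask are not reproduced here; the
    	   body just mirrors the listing above. */
    	static __m128i shuf2_sketch(__m128i a, __m128i b)
    	{
    	    int hi = _mm_extract_epi16(a, 7);       /* pextrw  $7, %xmm0, %eax    */
    	    __m128i t = _mm_unpacklo_epi64(a, b);   /* punpcklqdq %xmm1, %xmm0    */
    	    t = _mm_shufflelo_epi16(t, 0x80);       /* pshuflw $128, %xmm0, %xmm0 */
    	    return _mm_insert_epi16(t, hi, 2);      /* pinsrw  $2, %eax, %xmm0    */
    	}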
    
    =========
    
    New:
    _shuf4:
    	punpcklqdq	%xmm1, %xmm0
    	pshufb	LCPI1_0, %xmm0
    
    Old:
    _shuf4:
    	pextrw	$3, %xmm0, %eax
    	movsd	%xmm1, %xmm0
    	pextrw	$3, %xmm1, %ecx
    	pinsrw	$4, %ecx, %xmm0
    	pinsrw	$5, %eax, %xmm0
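
    The new _shuf4 code is the SSSE3 pshufb path: merge the two inputs with
    punpcklqdq, then apply a single pshufb against a constant-pool mask
    (LCPI1_0). The actual mask bytes are not shown in this message, so the
    sketch below uses a placeholder identity mask just to show the shape.

    	#include <tmmintrin.h>  /* SSSE3 */

    	/* Sketch of the new _shuf4 lowering: punpcklqdq + pshufb LCPI1_0.
    	   The real byte mask lives in the constant pool and depends on the test's
    	   shuffle; the identity indices below are only a placeholder. */
    	static __m128i shuf4_sketch(__m128i a, __m128i b)
    	{
    	    const __m128i mask = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
    	                                       7,  6,  5,  4,  3,  2, 1, 0);
    	    __m128i t = _mm_unpacklo_epi64(a, b);  /* punpcklqdq %xmm1, %xmm0 */
    	    return _mm_shuffle_epi8(t, mask);      /* pshufb LCPI1_0, %xmm0   */
    	}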
    
    =========
    
    New:
    _shuf1:
    	pushl	%ebx
    	pushl	%edi
    	pushl	%esi
    	pextrw	$1, %xmm0, %eax
    	rolw	$8, %ax
    	movd	%xmm0, %ecx
    	rolw	$8, %cx
    	pextrw	$5, %xmm0, %edx
    	pextrw	$4, %xmm0, %esi
    	pextrw	$3, %xmm0, %edi
    	pextrw	$2, %xmm0, %ebx
    	movaps	%xmm0, %xmm1
    	pinsrw	$0, %ecx, %xmm1
    	pinsrw	$1, %eax, %xmm1
    	rolw	$8, %bx
    	pinsrw	$2, %ebx, %xmm1
    	rolw	$8, %di
    	pinsrw	$3, %edi, %xmm1
    	rolw	$8, %si
    	pinsrw	$4, %esi, %xmm1
    	rolw	$8, %dx
    	pinsrw	$5, %edx, %xmm1
    	pextrw	$7, %xmm0, %eax
    	rolw	$8, %ax
    	movaps	%xmm1, %xmm0
    	pinsrw	$7, %eax, %xmm0
    	popl	%esi
    	popl	%edi
    	popl	%ebx
    	ret
    
    Old:
    _shuf1:
    	subl	$252, %esp
    	movaps	%xmm0, (%esp)
    	movaps	%xmm0, 16(%esp)
    	movaps	%xmm0, 32(%esp)
    	movaps	%xmm0, 48(%esp)
    	movaps	%xmm0, 64(%esp)
    	movaps	%xmm0, 80(%esp)
    	movaps	%xmm0, 96(%esp)
    	movaps	%xmm0, 224(%esp)
    	movaps	%xmm0, 208(%esp)
    	movaps	%xmm0, 192(%esp)
    	movaps	%xmm0, 176(%esp)
    	movaps	%xmm0, 160(%esp)
    	movaps	%xmm0, 144(%esp)
    	movaps	%xmm0, 128(%esp)
    	movaps	%xmm0, 112(%esp)
    	movzbl	14(%esp), %eax
    	movd	%eax, %xmm1
    	movzbl	22(%esp), %eax
    	movd	%eax, %xmm2
    	punpcklbw	%xmm1, %xmm2
    	movzbl	42(%esp), %eax
    	movd	%eax, %xmm1
    	movzbl	50(%esp), %eax
    	movd	%eax, %xmm3
    	punpcklbw	%xmm1, %xmm3
    	punpcklbw	%xmm2, %xmm3
    	movzbl	77(%esp), %eax
    	movd	%eax, %xmm1
    	movzbl	84(%esp), %eax
    	movd	%eax, %xmm2
    	punpcklbw	%xmm1, %xmm2
    	movzbl	104(%esp), %eax
    	movd	%eax, %xmm1
    	punpcklbw	%xmm1, %xmm0
    	punpcklbw	%xmm2, %xmm0
    	movaps	%xmm0, %xmm1
    	punpcklbw	%xmm3, %xmm1
    	movzbl	127(%esp), %eax
    	movd	%eax, %xmm0
    	movzbl	135(%esp), %eax
    	movd	%eax, %xmm2
    	punpcklbw	%xmm0, %xmm2
    	movzbl	155(%esp), %eax
    	movd	%eax, %xmm0
    	movzbl	163(%esp), %eax
    	movd	%eax, %xmm3
    	punpcklbw	%xmm0, %xmm3
    	punpcklbw	%xmm2, %xmm3
    	movzbl	188(%esp), %eax
    	movd	%eax, %xmm0
    	movzbl	197(%esp), %eax
    	movd	%eax, %xmm2
    	punpcklbw	%xmm0, %xmm2
    	movzbl	217(%esp), %eax
    	movd	%eax, %xmm4
    	movzbl	225(%esp), %eax
    	movd	%eax, %xmm0
    	punpcklbw	%xmm4, %xmm0
    	punpcklbw	%xmm2, %xmm0
    	punpcklbw	%xmm3, %xmm0
    	punpcklbw	%xmm1, %xmm0
    	addl	$252, %esp
    	ret
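
    The rolw $8 on each extracted word in the new _shuf1 swaps the two bytes of
    that 16-bit lane, so the SSE2 path is an extract/rotate/insert per lane
    instead of the old round-trip through the stack. On SSSE3, a v16i8 shuffle
    of this kind collapses into a single pshufb; a sketch of that form follows,
    assuming a full byte-swap-within-words shuffle, which may not match the
    exact mask in the test.

    	#include <tmmintrin.h>  /* SSSE3 */

    	/* Sketch only: if the shuffle swaps the bytes within every 16-bit lane
    	   (as the per-word rolw $8 suggests), SSSE3 expresses it as one pshufb.
    	   Illustrates the "fewer uops" point; not the test code itself. */
    	static __m128i shuf1_ssse3_sketch(__m128i a)
    	{
    	    const __m128i swap = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9,
    	                                       6,  7,  4,  5,  2,  3, 0, 1);
    	    return _mm_shuffle_epi8(a, swap);  /* pshufb swap, %xmm0 */
    	}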
    
    llvm-svn: 65311