Feb 23, 2009
      Generate better code for v8i16 shuffles on SSE2 · e684da3e
      Nate Begeman authored
Generate better code for v16i8 shuffles on SSE2 (avoids the stack).
Generate pshufb for v8i16 and v16i8 shuffles on SSSE3 where it takes fewer uops.
Document the shuffle matching logic and add some FIXMEs for further
  cleanups later.
Add new tests that exercise the above.
      
      Examples:
      
      New:
      _shuf2:
      	pextrw	$7, %xmm0, %eax
      	punpcklqdq	%xmm1, %xmm0
      	pshuflw	$128, %xmm0, %xmm0
      	pinsrw	$2, %eax, %xmm0
      
      Old:
      _shuf2:
      	pextrw	$2, %xmm0, %eax
      	pextrw	$7, %xmm0, %ecx
      	pinsrw	$2, %ecx, %xmm0
      	pinsrw	$3, %eax, %xmm0
      	movd	%xmm1, %eax
      	pinsrw	$4, %eax, %xmm0
      	ret
      
      =========
      
      New:
      _shuf4:
      	punpcklqdq	%xmm1, %xmm0
      	pshufb	LCPI1_0, %xmm0
      
      Old:
      _shuf4:
      	pextrw	$3, %xmm0, %eax
      	movsd	%xmm1, %xmm0
      	pextrw	$3, %xmm1, %ecx
      	pinsrw	$4, %ecx, %xmm0
      	pinsrw	$5, %eax, %xmm0
      
      ========
      
      New:
      _shuf1:
      	pushl	%ebx
      	pushl	%edi
      	pushl	%esi
      	pextrw	$1, %xmm0, %eax
      	rolw	$8, %ax
      	movd	%xmm0, %ecx
      	rolw	$8, %cx
      	pextrw	$5, %xmm0, %edx
      	pextrw	$4, %xmm0, %esi
      	pextrw	$3, %xmm0, %edi
      	pextrw	$2, %xmm0, %ebx
      	movaps	%xmm0, %xmm1
      	pinsrw	$0, %ecx, %xmm1
      	pinsrw	$1, %eax, %xmm1
      	rolw	$8, %bx
      	pinsrw	$2, %ebx, %xmm1
      	rolw	$8, %di
      	pinsrw	$3, %edi, %xmm1
      	rolw	$8, %si
      	pinsrw	$4, %esi, %xmm1
      	rolw	$8, %dx
      	pinsrw	$5, %edx, %xmm1
      	pextrw	$7, %xmm0, %eax
      	rolw	$8, %ax
      	movaps	%xmm1, %xmm0
      	pinsrw	$7, %eax, %xmm0
      	popl	%esi
      	popl	%edi
      	popl	%ebx
      	ret
      
      Old:
      _shuf1:
      	subl	$252, %esp
      	movaps	%xmm0, (%esp)
      	movaps	%xmm0, 16(%esp)
      	movaps	%xmm0, 32(%esp)
      	movaps	%xmm0, 48(%esp)
      	movaps	%xmm0, 64(%esp)
      	movaps	%xmm0, 80(%esp)
      	movaps	%xmm0, 96(%esp)
      	movaps	%xmm0, 224(%esp)
      	movaps	%xmm0, 208(%esp)
      	movaps	%xmm0, 192(%esp)
      	movaps	%xmm0, 176(%esp)
      	movaps	%xmm0, 160(%esp)
      	movaps	%xmm0, 144(%esp)
      	movaps	%xmm0, 128(%esp)
      	movaps	%xmm0, 112(%esp)
      	movzbl	14(%esp), %eax
      	movd	%eax, %xmm1
      	movzbl	22(%esp), %eax
      	movd	%eax, %xmm2
      	punpcklbw	%xmm1, %xmm2
      	movzbl	42(%esp), %eax
      	movd	%eax, %xmm1
      	movzbl	50(%esp), %eax
      	movd	%eax, %xmm3
      	punpcklbw	%xmm1, %xmm3
      	punpcklbw	%xmm2, %xmm3
      	movzbl	77(%esp), %eax
      	movd	%eax, %xmm1
      	movzbl	84(%esp), %eax
      	movd	%eax, %xmm2
      	punpcklbw	%xmm1, %xmm2
      	movzbl	104(%esp), %eax
      	movd	%eax, %xmm1
      	punpcklbw	%xmm1, %xmm0
      	punpcklbw	%xmm2, %xmm0
      	movaps	%xmm0, %xmm1
      	punpcklbw	%xmm3, %xmm1
      	movzbl	127(%esp), %eax
      	movd	%eax, %xmm0
      	movzbl	135(%esp), %eax
      	movd	%eax, %xmm2
      	punpcklbw	%xmm0, %xmm2
      	movzbl	155(%esp), %eax
      	movd	%eax, %xmm0
      	movzbl	163(%esp), %eax
      	movd	%eax, %xmm3
      	punpcklbw	%xmm0, %xmm3
      	punpcklbw	%xmm2, %xmm3
      	movzbl	188(%esp), %eax
      	movd	%eax, %xmm0
      	movzbl	197(%esp), %eax
      	movd	%eax, %xmm2
      	punpcklbw	%xmm0, %xmm2
      	movzbl	217(%esp), %eax
      	movd	%eax, %xmm4
      	movzbl	225(%esp), %eax
      	movd	%eax, %xmm0
      	punpcklbw	%xmm4, %xmm0
      	punpcklbw	%xmm2, %xmm0
      	punpcklbw	%xmm3, %xmm0
      	punpcklbw	%xmm1, %xmm0
      	addl	$252, %esp
      	ret
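
For reference, here is a minimal C++ intrinsics sketch of the SSSE3 strategy behind the new _shuf4 above: concatenate the low halves with punpcklqdq, then apply a single pshufb whose control mask (the constant-pool value LCPI1_0 in the asm) picks the final byte order. The mask below is illustrative, not the exact constant the compiler emitted.

	#include <tmmintrin.h>  // SSSE3: _mm_shuffle_epi8 (pshufb)

	__m128i shuf4_sketch(__m128i a, __m128i b) {
	  // punpcklqdq: result bytes 0-7 come from a's low qword, 8-15 from b's.
	  __m128i merged = _mm_unpacklo_epi64(a, b);
	  // pshufb control: each byte selects one byte of `merged`; consecutive
	  // index pairs move a whole i16 lane at once. Illustrative mask only.
	  const __m128i mask = _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11,
	                                     4, 5, 6, 7, 12, 13, 14, 15);
	  return _mm_shuffle_epi8(merged, mask);
	}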
      
      llvm-svn: 65311
      Introduce the BuildVectorSDNode class that encapsulates the ISD::BUILD_VECTOR · 9d31aca6
      Scott Michel authored
instruction. The class also consolidates the code for detecting constant
splats that is shared between the PowerPC and CellSPU backends (and might
be useful for other backends). Also introduces SelectionDAG::getBUILD_VECTOR()
for generating new BUILD_VECTOR nodes.
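
As a rough usage sketch, a backend might query the new class like this; the parameter list follows later LLVM releases and may not match the r65296 signature exactly.

	#include "llvm/CodeGen/SelectionDAGNodes.h"
	using namespace llvm;

	// Returns true (and the splatted bits) if Op is a BUILD_VECTOR whose
	// elements are all the same constant, modulo undef elements.
	static bool getSplatBits(SDValue Op, APInt &SplatBits) {
	  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op.getNode());
	  if (!BV)
	    return false;
	  APInt SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  return BV->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
	                             HasAnyUndefs);
	}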
      
      llvm-svn: 65296
Jan 19, 2009
      Minor tweak to LowerUINT_TO_FP_i32. Bias (after scalar_to_vector) has two uses... · 8f367e53
      Evan Cheng authored
Minor tweak to LowerUINT_TO_FP_i32. Bias (after scalar_to_vector) has two uses, so we should make it the second source operand of ISD::OR so that the 2-address pass won't have to be smart about commuting.
      
      %reg1024<def> = MOVSDrm %reg0, 1, %reg0, <cp#0>, Mem:LD(8,8) [ConstantPool + 0]
      %reg1025<def> = MOVSD2PDrr %reg1024
      %reg1026<def> = MOVDI2PDIrm <fi#-1>, 1, %reg0, 0, Mem:LD(4,16) [FixedStack-1 + 0]
      %reg1027<def> = ORPSrr %reg1025<kill>, %reg1026<kill>
      %reg1028<def> = MOVPD2SDrr %reg1027<kill>
      %reg1029<def> = SUBSDrr %reg1028<kill>, %reg1024<kill>
      %reg1030<def> = CVTSD2SSrr %reg1029<kill>
      MOVSSmr <fi#0>, 1, %reg0, 0, %reg1030<kill>, Mem:ST(4,4) [FixedStack0 + 0]
      %reg1031<def> = LD_Fp32m80 <fi#0>, 1, %reg0, 0, Mem:LD(4,16) [FixedStack0 + 0]
      RET %reg1031<kill>, %ST0<imp-use,kill>
      
The reason the 2-address pass isn't smart enough to commute the ORPSrr is that it can't look past the MOVSD2PDrr instruction.
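
In code, the tweak amounts to an operand-ordering convention along these lines (a sketch in the modern SelectionDAG style with illustrative names, not the literal change):

	#include "llvm/CodeGen/SelectionDAG.h"
	using namespace llvm;

	// ISD::OR is commutative. Load dies at this use while Bias is used
	// again by the later FSUB, so placing Load first lets the two-address
	// pass tie the destination register to it without commuting.
	static SDValue orLoadWithBias(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
	                              SDValue Load, SDValue Bias) {
	  return DAG.getNode(ISD::OR, dl, VT, Load, Bias);
	}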
      
      llvm-svn: 62505
Now that UINT_TO_FP is legal (it's marked custom), dag combiner won't · 7e9ef4d7
      Evan Cheng authored
      optimize it to a SINT_TO_FP when the sign bit is known zero. X86 isel should perform the optimization itself.
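
A minimal sketch of the check X86 isel can now perform itself, assuming the standard SelectionDAG sign-bit query (illustrative, not the literal commit):

	#include "llvm/CodeGen/SelectionDAG.h"
	using namespace llvm;

	// When the sign bit of Src is known zero, unsigned and signed
	// conversions agree, and SINT_TO_FP is the cheaper form on x86.
	static SDValue lowerUIntToFP(SelectionDAG &DAG, const SDLoc &dl, EVT VT,
	                             SDValue Src) {
	  if (DAG.SignBitIsZero(Src))
	    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Src);
	  return DAG.getNode(ISD::UINT_TO_FP, dl, VT, Src);
	}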
      
      llvm-svn: 62504