- Apr 06, 2004
-
Chris Lattner authored
comparing a long against zero got us this:

        sub %ESP, 8
        mov DWORD PTR [%ESP + 4], %ESI
        mov DWORD PTR [%ESP], %EDI
        mov %EAX, DWORD PTR [%ESP + 12]
        mov %EDX, DWORD PTR [%ESP + 16]
        mov %ECX, 0
        mov %ESI, 0
        mov %EDI, %EAX
        xor %EDI, %ECX
        mov %ECX, %EDX
        xor %ECX, %ESI
        or %EDI, %ECX
        sete %CL
        test %CL, %CL
        je .LBB2 # PC rel: F

Now it gets us this:

        mov %EAX, DWORD PTR [%ESP + 4]
        mov %EDX, DWORD PTR [%ESP + 8]
        mov %ECX, %EAX
        or %ECX, %EDX
        sete %CL
        test %CL, %CL
        je .LBB2 # PC rel: F

llvm-svn: 12696
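For context, a source fragment of the kind that exercises this path might look like the following (a hypothetical reconstruction; the commit does not include its testcase):

; hypothetical testcase: branch on a long compared against zero
void %test(long %X) {
        %c = seteq long %X, 0
        br bool %c, label %is_zero, label %not_zero
is_zero:
        ret void
not_zero:
        ret void
}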
-
Chris Lattner authored
example, multiplying X*(1 + (1LL << 32)) now produces:

test:
        mov %ECX, DWORD PTR [%ESP + 4]
        mov %EDX, DWORD PTR [%ESP + 8]
        mov %EAX, %ECX
        add %EDX, %ECX
        ret

[[[Note to Alkis: why isn't linear scan generating this code?? This might
be a problem with your intervals being too conservative:

test:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %EDX, DWORD PTR [%ESP + 8]
        add %EDX, %EAX
        ret

end note]]]

Whereas GCC produces this:

T:
        sub %esp, 12
        mov %edx, DWORD PTR [%esp+16]
        mov DWORD PTR [%esp+8], %edi
        mov %ecx, DWORD PTR [%esp+20]
        xor %edi, %edi
        mov DWORD PTR [%esp], %ebx
        mov %ebx, %edi
        mov %eax, %edx
        mov DWORD PTR [%esp+4], %esi
        add %ebx, %edx
        mov %edi, DWORD PTR [%esp+8]
        lea %edx, [%ecx+%ebx]
        mov %esi, DWORD PTR [%esp+4]
        mov %ebx, DWORD PTR [%esp]
        add %esp, 12
        ret

I'm not sure what GCC is smoking here, but it looks like it has just
confused itself with a bunch of stack slots or something. The Intel
compiler is better, but still not good:

T:
        movl 4(%esp), %edx        #2.11
        movl 8(%esp), %eax        #2.11
        lea (%eax,%edx), %ecx     #3.12
        movl $1, %eax             #3.12
        mull %edx                 #3.12
        addl %ecx, %edx           #3.12
        ret                       #3.12

llvm-svn: 12693
-
Chris Lattner authored
long %test(long %X) {
        %Y = mul long %X, 123
        ret long %Y
}

we used to generate:

test:
        sub %ESP, 12
        mov DWORD PTR [%ESP + 8], %ESI
        mov DWORD PTR [%ESP + 4], %EDI
        mov DWORD PTR [%ESP], %EBX
        mov %ECX, DWORD PTR [%ESP + 16]
        mov %ESI, DWORD PTR [%ESP + 20]
        mov %EDI, 123
        mov %EBX, 0
        mov %EAX, %ECX
        mul %EDI
        imul %ESI, %EDI
        add %ESI, %EDX
        imul %ECX, %EBX
        add %ESI, %ECX
        mov %EDX, %ESI
        mov %EBX, DWORD PTR [%ESP]
        mov %EDI, DWORD PTR [%ESP + 4]
        mov %ESI, DWORD PTR [%ESP + 8]
        add %ESP, 12
        ret

Now we emit:

test:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
        mov %EDX, 123
        mul %EDX
        imul %ECX, %ECX, 123
        add %ECX, %EDX
        mov %EDX, %ECX
        ret

Which, incidentally, is substantially nicer than what GCC manages:

T:
        sub %esp, 8
        mov %eax, 123
        mov DWORD PTR [%esp], %ebx
        mov %ebx, DWORD PTR [%esp+16]
        mov DWORD PTR [%esp+4], %esi
        mov %esi, DWORD PTR [%esp+12]
        imul %ecx, %ebx, 123
        mov %ebx, DWORD PTR [%esp]
        mul %esi
        mov %esi, DWORD PTR [%esp+4]
        add %esp, 8
        lea %edx, [%ecx+%edx]
        ret

llvm-svn: 12692
-
Chris Lattner authored
On this testcase:

long %test(long %X) {
        %Y = shr long %X, ubyte 32
        ret long %Y
}

instead of:

t:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %EAX, DWORD PTR [%ESP + 8]
        sar %EAX, 0
        mov %EDX, 0
        ret

we now emit:

test:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %EAX, DWORD PTR [%ESP + 8]
        mov %EDX, 0
        ret

llvm-svn: 12688
-
Chris Lattner authored
llvm-svn: 12687
-
Chris Lattner authored
For example, on this instruction:

        call void %test(long 1234)

Instead of this:

        mov %EAX, 1234
        mov %ECX, 0
        mov DWORD PTR [%ESP], %EAX
        mov DWORD PTR [%ESP + 4], %ECX
        call test

We now emit this:

        mov DWORD PTR [%ESP], 1234
        mov DWORD PTR [%ESP + 4], 0
        call test

llvm-svn: 12686
-
Chris Lattner authored
of the words of the constant is zeros. For example:

        Y = and long X, 1234

now generates:

        Yl = and Xl, 1234
        Yh = 0

instead of:

        Yl = and Xl, 1234
        Yh = and Xh, 0

llvm-svn: 12685
-
Chris Lattner authored
llvm-svn: 12684
-
Chris Lattner authored
This allows us to handle code like 'add long %X, 123456789012' more efficiently. llvm-svn: 12683
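For illustration only (the commit shows no output), the standard lowering splits such a 64-bit immediate into 32-bit halves and chains the carry; a hand-worked sketch, with 123456789012 = 0x1CBE991A14 and the register choice purely illustrative:

        add %EAX, 3197704724      # low 32 bits:  0xBE991A14
        adc %EDX, 28              # high 32 bits: 0x1C, plus carry from the low add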
-
Chris Lattner authored
long %test(long %X) {
        %Y = sub long 0, %X
        ret long %Y
}

We used to generate:

test:
        sub %ESP, 4
        mov DWORD PTR [%ESP], %ESI
        mov %ECX, DWORD PTR [%ESP + 8]
        mov %ESI, DWORD PTR [%ESP + 12]
        mov %EAX, 0
        mov %EDX, 0
        sub %EAX, %ECX
        sbb %EDX, %ESI
        mov %ESI, DWORD PTR [%ESP]
        add %ESP, 4
        ret

Now we generate:

test:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %EDX, DWORD PTR [%ESP + 8]
        neg %EAX
        adc %EDX, 0
        neg %EDX
        ret

llvm-svn: 12681
-
Chris Lattner authored
llvm-svn: 12680
-
Chris Lattner authored
* In promote32, if we can just promote a constant value, do so instead
  of promoting a constant dynamically.
* In visitReturnInst, actually USE the promote32 argument that takes a
  Value*

The end result of this is that we now generate this:

test:
        mov %EAX, 0
        ret

instead of...

test:
        mov %AX, 0
        movzx %EAX, %AX
        ret

for:

ushort %test() {
        ret ushort 0
}

llvm-svn: 12679
-
- Apr 05, 2004
-
Chris Lattner authored
types and can have arbitrary 32- and 64-bit integer types indexing into sequential types. llvm-svn: 12653
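A hypothetical testcase of the sort this permits (a sketch, not from the commit), mixing 32- and 64-bit index types into sequential types:

int* %test([10 x int]* %A, uint %i, long %j) {
        %p = getelementptr [10 x int]* %A, long 0, uint %i   ; 32-bit index
        %q = getelementptr int* %p, long %j                  ; 64-bit index
        ret int* %q
}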
-
- Apr 02, 2004
-
Alkis Evlogimenos authored
llvm-svn: 12615
-
Alkis Evlogimenos authored
llvm-svn: 12610
-
- Apr 01, 2004
-
Chris Lattner authored
llvm-svn: 12579
-
Chris Lattner authored
the X86 does not support a full set of fp cmove instructions, so we can't always fold the condition into the select. :( Yuck. llvm-svn: 12577
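As a concrete illustration (hypothetical, not the commit's testcase): the x86 fcmov family only covers the unsigned/equality condition codes (b, e, be, u and their negations), so a select keyed on a signed comparison like this one has no directly foldable form:

double %test(int %a, int %b, double %X, double %Y) {
        %c = setgt int %a, %b            ; signed 'greater' has no fcmov encoding
        %r = select bool %c, double %X, double %Y
        ret double %r
}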
-
- Mar 31, 2004
-
Chris Lattner authored
using our broad selection of movcc instructions. :) llvm-svn: 12560
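For illustration (a sketch, not the commit's testcase), an integer select such as:

int %max(int %a, int %b) {
        %c = setgt int %a, %b
        %m = select bool %c, int %a, int %b
        ret int %m
}

can now lower to a branchless sequence along these lines (register choice and exact mnemonic illustrative):

max:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
        cmp %EAX, %ECX
        cmovle %EAX, %ECX         # if %a <= %b, the result is %b
        ret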
-
- Mar 30, 2004
-
Chris Lattner authored
folding compares into the select yet. llvm-svn: 12553
-
Chris Lattner authored
an incoming value from a block, the selector would evaluate the constant at the TOP of the block instead of at the end of the block. This made the live range for the constant span the entire block, increasing register pressure needlessly. llvm-svn: 12542
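A sketch of the situation (hypothetical testcase, not from the commit): the constant 0 feeding %i is only needed on the edge out of %entry, so materializing the copy of 0 at the end of %entry, rather than at its top, keeps it from being live across the unrelated arithmetic above it:

int %test(int %a, int %b) {
entry:
        %t1 = mul int %a, %b
        %t2 = add int %t1, %a
        br label %loop
loop:
        %i = phi int [ 0, %entry ], [ %i.next, %loop ]
        %i.next = add int %i, %t2
        %done = setgt int %i.next, 100
        br bool %done, label %exit, label %loop
exit:
        ret int %i.next
}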
-
- Mar 18, 2004
-
Chris Lattner authored
llvm-svn: 12500
-
Chris Lattner authored
folding load instructions into other instructions across free instruction boundaries. Perhaps this will also fix the other strange failures? llvm-svn: 12494
-
- Mar 13, 2004
-
Chris Lattner authored
llvm-svn: 12357
-
Chris Lattner authored
Intrinsic::va*. This avoids conflicting with macros in the stdlib.h file. llvm-svn: 12356
-
- Mar 08, 2004
-
Chris Lattner authored
testcase like this:

int %test(int* %P, int %A) {
        %Pv = load int* %P
        %B = add int %A, %Pv
        ret int %B
}

We now generate:

test:
        mov %ECX, DWORD PTR [%ESP + 4]
        mov %EAX, DWORD PTR [%ESP + 8]
        add %EAX, DWORD PTR [%ECX]
        ret

Instead of:

test:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
        mov %EAX, DWORD PTR [%EAX]
        add %EAX, %ECX
        ret

... saving one instruction, and often a register. Note that there are a
lot of other instructions that could use this, but they aren't handled.
I'm not really interested in adding them, but mul/div and all of the FP
instructions could be supported as well if someone wanted to add them.

llvm-svn: 12204
-
Chris Lattner authored
llvm-svn: 12203
-
- Mar 02, 2004
-
Misha Brukman authored
llvm-svn: 12064
-
- Mar 01, 2004
-
Chris Lattner authored
of generating this code:

        mov %EAX, 4
        mov DWORD PTR [%ESP], %EAX
        mov %AX, 123
        movsx %EAX, %AX
        mov DWORD PTR [%ESP + 4], %EAX
        call Y

we now generate:

        mov DWORD PTR [%ESP], 4
        mov DWORD PTR [%ESP + 4], 123
        call Y

Which hurts the eyes less. :)

Considering that register pressure around call sites is already high
(with all of the callee clobber registers n stuff), this may help a lot.

llvm-svn: 12028
-
Chris Lattner authored
to function calls, we would emit dead code, like this:

int Y(int, short, double);
int X() {
  Y(4, 123, 4);
}

--- Old
X:
        sub %ESP, 20
        mov %EAX, 4
        mov DWORD PTR [%ESP], %EAX
***     mov %AX, 123
        mov %AX, 123
        movsx %EAX, %AX
        mov DWORD PTR [%ESP + 4], %EAX
        fld QWORD PTR [.CPIX_0]
        fstp QWORD PTR [%ESP + 8]
        call Y
        mov %EAX, 0   # IMPLICIT_USE %EAX %ESP
        add %ESP, 20
        ret

Now we emit:

X:
        sub %ESP, 20
        mov %EAX, 4
        mov DWORD PTR [%ESP], %EAX
        mov %AX, 123
        movsx %EAX, %AX
        mov DWORD PTR [%ESP + 4], %EAX
        fld QWORD PTR [.CPIX_0]
        fstp QWORD PTR [%ESP + 8]
        call Y
        mov %EAX, 0   # IMPLICIT_USE %EAX %ESP
        add %ESP, 20
        ret

Next up, eliminate the mov AX and movsx entirely!

llvm-svn: 12026
-
- Feb 29, 2004
-
Alkis Evlogimenos authored
their names more descriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:

ADD8rr      -> add, 8-bit register, 8-bit register
IMUL16rmi   -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8  -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory

llvm-svn: 11995
-
Chris Lattner authored
Replace uses of addZImm with addImm. llvm-svn: 11992
-
Chris Lattner authored
block loops. llvm-svn: 11990
-
Chris Lattner authored
llvm-svn: 11984
-
Alkis Evlogimenos authored
to denote this fact. llvm-svn: 11972
-
Alkis Evlogimenos authored
denote this fact. llvm-svn: 11971
-
- Feb 27, 2004
-
Alkis Evlogimenos authored
consistent with the rest and also prepare for the addition of their memory operand variants. llvm-svn: 11902
-
- Feb 26, 2004
-
Alkis Evlogimenos authored
MRegisterInfo::is{Physical,Virtual}Register. Apply appropriate fixes to relevant files. llvm-svn: 11882
-
Chris Lattner authored
bugs. Thanks Brian! llvm-svn: 11859
-
- Feb 25, 2004
-
Chris Lattner authored
scaled indexes. This allows us to compile GEP's like this:

int* %test([10 x { int, { int } }]* %X, int %Idx) {
        %Idx = cast int %Idx to long
        %X = getelementptr [10 x { int, { int } }]* %X, long 0, long %Idx, ubyte 1, ubyte 0
        ret int* %X
}

Into a single address computation:

test:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
        lea %EAX, DWORD PTR [%EAX + 8*%ECX + 4]
        ret

Before it generated:

test:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
        shl %ECX, 3
        add %EAX, %ECX
        lea %EAX, DWORD PTR [%EAX + 4]
        ret

This is useful for things like int/float/double arrays, as the indexing
can be folded into the loads&stores, reducing register pressure and
decreasing the pressure on the decode unit. With these changes, I expect
our performance on 256.bzip2 and gzip to improve a lot.

On bzip2 for example, we go from this:

10665 asm-printer           - Number of machine instrs printed
   40 ra-local              - Number of loads/stores folded into instructions
 1708 ra-local              - Number of loads added
 1532 ra-local              - Number of stores added
 1354 twoaddressinstruction - Number of instructions added
 1354 twoaddressinstruction - Number of two-address instructions
 2794 x86-peephole          - Number of peephole optimization performed

to this:

 9873 asm-printer           - Number of machine instrs printed
   41 ra-local              - Number of loads/stores folded into instructions
 1710 ra-local              - Number of loads added
 1521 ra-local              - Number of stores added
  789 twoaddressinstruction - Number of instructions added
  789 twoaddressinstruction - Number of two-address instructions
 2142 x86-peephole          - Number of peephole optimization performed

... and these types of instructions are often in tight loops.

Linear scan is also helped, but not as much. It goes from:

 8787 asm-printer           - Number of machine instrs printed
 2389 liveintervals         - Number of identity moves eliminated after coalescing
 2288 liveintervals         - Number of interval joins performed
 3522 liveintervals         - Number of intervals after coalescing
 5810 liveintervals         - Number of original intervals
  700 spiller               - Number of loads added
  487 spiller               - Number of stores added
  303 spiller               - Number of register spills
 1354 twoaddressinstruction - Number of instructions added
 1354 twoaddressinstruction - Number of two-address instructions
  363 x86-peephole          - Number of peephole optimization performed

to:

 7982 asm-printer           - Number of machine instrs printed
 1759 liveintervals         - Number of identity moves eliminated after coalescing
 1658 liveintervals         - Number of interval joins performed
 3282 liveintervals         - Number of intervals after coalescing
 4940 liveintervals         - Number of original intervals
  635 spiller               - Number of loads added
  452 spiller               - Number of stores added
  288 spiller               - Number of register spills
  789 twoaddressinstruction - Number of instructions added
  789 twoaddressinstruction - Number of two-address instructions
  258 x86-peephole          - Number of peephole optimization performed

Though I'm not complaining about the drop in the number of intervals. :)

llvm-svn: 11820
-
Chris Lattner authored
to do analysis.

*** FOLD getelementptr instructions into loads and stores when possible,
making use of some of the crazy X86 addressing modes.

For example, the following C++ program fragment:

struct complex {
  double re, im;
  complex(double r, double i) : re(r), im(i) {}
};
inline complex operator+(const complex& a, const complex& b) {
  return complex(a.re+b.re, a.im+b.im);
}
complex addone(const complex& arg) {
  return arg + complex(1,0);
}

Used to be compiled to:

_Z6addoneRK7complex:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
***     mov %EDX, %ECX
        fld QWORD PTR [%EDX]
        fld1
        faddp %ST(1)
***     add %ECX, 8
        fld QWORD PTR [%ECX]
        fldz
        faddp %ST(1)
***     mov %ECX, %EAX
        fxch %ST(1)
        fstp QWORD PTR [%ECX]
***     add %EAX, 8
        fstp QWORD PTR [%EAX]
        ret

Now it is compiled to:

_Z6addoneRK7complex:
        mov %EAX, DWORD PTR [%ESP + 4]
        mov %ECX, DWORD PTR [%ESP + 8]
        fld QWORD PTR [%ECX]
        fld1
        faddp %ST(1)
        fld QWORD PTR [%ECX + 8]
        fldz
        faddp %ST(1)
        fxch %ST(1)
        fstp QWORD PTR [%EAX]
        fstp QWORD PTR [%EAX + 8]
        ret

Other programs should see similar improvements, across the board. Note
that in addition to reducing instruction count, this also reduces
register pressure a lot, always a good thing on X86. :)

llvm-svn: 11819
-