  1. Apr 06, 2004
    • Improve codegen of long == and != comparisons against constants. · f2ee88eb
      Chris Lattner authored
      Before, comparing a long against zero got us this:
      
              sub %ESP, 8
              mov DWORD PTR [%ESP + 4], %ESI
              mov DWORD PTR [%ESP], %EDI
              mov %EAX, DWORD PTR [%ESP + 12]
              mov %EDX, DWORD PTR [%ESP + 16]
              mov %ECX, 0
              mov %ESI, 0
              mov %EDI, %EAX
              xor %EDI, %ECX
              mov %ECX, %EDX
              xor %ECX, %ESI
              or %EDI, %ECX
              sete %CL
              test %CL, %CL
              je .LBB2 # PC rel: F
      
      Now it gets us this:
      
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %EDX, DWORD PTR [%ESP + 8]
              mov %ECX, %EAX
              or %ECX, %EDX
              sete %CL
              test %CL, %CL
              je .LBB2 # PC rel: F
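
      Presumably the source was along these lines (a hypothetical C reconstruction; the commit does not include a testcase):

      int test(long long X) {
          /* the low and high words can be OR'd together and tested once */
          return X == 0;
      }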
      
      llvm-svn: 12696
    • Handle various other important cases of multiplying a long constant immediate. · 6c3bf13f
      Chris Lattner authored
      For example, multiplying X*(1 + (1LL << 32)) now produces:
      
      test:
              mov %ECX, DWORD PTR [%ESP + 4]
              mov %EDX, DWORD PTR [%ESP + 8]
              mov %EAX, %ECX
              add %EDX, %ECX
              ret
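
      That is, for source like this (a hypothetical C equivalent of the multiply being described):

      long long test(long long X) {
          /* low word of the constant is 1, high word is 1 */
          return X * (1LL + (1LL << 32));
      }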
      
      [[[Note to Alkis: why isn't linear scan generating this code??  This might be a
       problem with your intervals being too conservative:
      
      test:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %EDX, DWORD PTR [%ESP + 8]
              add %EDX, %EAX
              ret
      
      end note]]]
      
      Whereas GCC produces this:
      
      T:
              sub     %esp, 12
              mov     %edx, DWORD PTR [%esp+16]
              mov     DWORD PTR [%esp+8], %edi
              mov     %ecx, DWORD PTR [%esp+20]
              xor     %edi, %edi
              mov     DWORD PTR [%esp], %ebx
              mov     %ebx, %edi
              mov     %eax, %edx
              mov     DWORD PTR [%esp+4], %esi
              add     %ebx, %edx
              mov     %edi, DWORD PTR [%esp+8]
              lea     %edx, [%ecx+%ebx]
              mov     %esi, DWORD PTR [%esp+4]
              mov     %ebx, DWORD PTR [%esp]
              add     %esp, 12
              ret
      
      I'm not sure what GCC is smoking here, but it looks like it has just
      confused itself with a bunch of stack slots or something.  The Intel
      compiler is better, but still not good:
      
      T:
              movl      4(%esp), %edx                                 #2.11
              movl      8(%esp), %eax                                 #2.11
              lea       (%eax,%edx), %ecx                             #3.12
              movl      $1, %eax                                      #3.12
              mull      %edx                                          #3.12
              addl      %ecx, %edx                                    #3.12
              ret                                                     #3.12
      
      llvm-svn: 12693
    • Efficiently handle a long multiplication by a constant. For this testcase: · 1f6024cb
      Chris Lattner authored
      long %test(long %X) {
              %Y = mul long %X, 123
              ret long %Y
      }
      
      we used to generate:
      
      test:
              sub %ESP, 12
              mov DWORD PTR [%ESP + 8], %ESI
              mov DWORD PTR [%ESP + 4], %EDI
              mov DWORD PTR [%ESP], %EBX
              mov %ECX, DWORD PTR [%ESP + 16]
              mov %ESI, DWORD PTR [%ESP + 20]
              mov %EDI, 123
              mov %EBX, 0
              mov %EAX, %ECX
              mul %EDI
              imul %ESI, %EDI
              add %ESI, %EDX
              imul %ECX, %EBX
              add %ESI, %ECX
              mov %EDX, %ESI
              mov %EBX, DWORD PTR [%ESP]
              mov %EDI, DWORD PTR [%ESP + 4]
              mov %ESI, DWORD PTR [%ESP + 8]
              add %ESP, 12
              ret
      
      Now we emit:
      test:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %ECX, DWORD PTR [%ESP + 8]
              mov %EDX, 123
              mul %EDX
              imul %ECX, %ECX, 123
              add %ECX, %EDX
              mov %EDX, %ECX
              ret
      
      Which, incidentally, is substantially nicer than what GCC manages:
      T:
              sub     %esp, 8
              mov     %eax, 123
              mov     DWORD PTR [%esp], %ebx
              mov     %ebx, DWORD PTR [%esp+16]
              mov     DWORD PTR [%esp+4], %esi
              mov     %esi, DWORD PTR [%esp+12]
              imul    %ecx, %ebx, 123
              mov     %ebx, DWORD PTR [%esp]
              mul     %esi
              mov     %esi, DWORD PTR [%esp+4]
              add     %esp, 8
              lea     %edx, [%ecx+%edx]
              ret
      
      llvm-svn: 12692
    • Improve code generation of long shifts by 32. · 2448baea
      Chris Lattner authored
      On this testcase:
      
      long %test(long %X) {
              %Y = shr long %X, ubyte 32
              ret long %Y
      }
      
      instead of:
      t:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %EAX, DWORD PTR [%ESP + 8]
              sar %EAX, 0
              mov %EDX, 0
              ret
      
      
      we now emit:
      test:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %EAX, DWORD PTR [%ESP + 8]
              mov %EDX, 0
              ret
      
      llvm-svn: 12688
    • Bugfixes: inc/dec don't set the carry flag! · 7332d4c5
      Chris Lattner authored
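      This matters when lowering 64-bit arithmetic, where the low-word instruction must leave the carry flag intact for the high word's adc/sbb. A minimal C illustration of an affected pattern (hypothetical; the commit carries no testcase):

      unsigned long long inc64(unsigned long long x) {
          /* on 32-bit x86 this lowers to add/adc; lowering the low-word
             add to `inc` would be wrong, because inc does not set the
             carry flag that the following adc reads */
          return x + 1;
      }
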
      llvm-svn: 12687
    • Improve code for passing constant longs as arguments to function calls. · decce5bc
      Chris Lattner authored
      For example, on this instruction:
      
              call void %test(long 1234)
      
      Instead of this:
              mov %EAX, 1234
              mov %ECX, 0
              mov DWORD PTR [%ESP], %EAX
              mov DWORD PTR [%ESP + 4], %ECX
              call test
      
      We now emit this:
              mov DWORD PTR [%ESP], 1234
              mov DWORD PTR [%ESP + 4], 0
              call test
      
      llvm-svn: 12686
    • Emit more efficient 64-bit operations when the RHS is a constant, and one of the words of the constant is zero. · 5fc6f77b
      Chris Lattner authored
      For example:
        Y = and long X, 1234
      
      now generates:
        Yl = and Xl, 1234
        Yh = 0
      
      instead of:
        Yl = and Xl, 1234
        Yh = and Xh, 0
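
      In C terms, the case is roughly (a hypothetical testcase; the commit gives only the pseudo-ops above):

      long long test(long long X) {
          /* the high 32 bits of the mask are zero, so the result's high
             word is known to be zero without emitting an `and` */
          return X & 1234;
      }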
      
      llvm-svn: 12685
    • Fix typo · b49608af
      Chris Lattner authored
      llvm-svn: 12684
    • Add support for simple immediate handling to long instruction selection. · 996e667a
      Chris Lattner authored
      This allows us to handle code like 'add long %X, 123456789012' more efficiently.
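
      A hypothetical C equivalent of that IR:

      long long test(long long X) {
          /* a 64-bit immediate whose two 32-bit halves are both nonzero */
          return X + 123456789012LL;
      }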
      
      llvm-svn: 12683
    • Implement negation of longs efficiently. For this testcase: · 37ba31f7
      Chris Lattner authored
      long %test(long %X) {
              %Y = sub long 0, %X
              ret long %Y
      }
      
      We used to generate:
      
      test:
              sub %ESP, 4
              mov DWORD PTR [%ESP], %ESI
              mov %ECX, DWORD PTR [%ESP + 8]
              mov %ESI, DWORD PTR [%ESP + 12]
              mov %EAX, 0
              mov %EDX, 0
              sub %EAX, %ECX
              sbb %EDX, %ESI
              mov %ESI, DWORD PTR [%ESP]
              add %ESP, 4
              ret
      
      Now we generate:
      
      test:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %EDX, DWORD PTR [%ESP + 8]
              neg %EAX
              adc %EDX, 0
              neg %EDX
              ret
      
      llvm-svn: 12681
    • Two changes: · 464e2ea5
      Chris Lattner authored
        * In promote32, if we can just promote a constant value, do so instead of
          promoting a constant dynamically.
        * In visitReturn inst, actually USE the promote32 argument that takes a
          Value*
      
      The end result of this is that we now generate this:
      
      test:
              mov %EAX, 0
              ret
      
      instead of...
      
      test:
              mov %AX, 0
              movzx %EAX, %AX
              ret
      
      for:
      
      ushort %test() {
              ret ushort 0
      }
      
      llvm-svn: 12679
  2. Mar 08, 2004
    • Implement folding explicit load instructions into binary operations. · 653e662a
      Chris Lattner authored
      For a testcase like this:
      
      int %test(int* %P, int %A) {
              %Pv = load int* %P
              %B = add int %A, %Pv
              ret int %B
      }
      
      We now generate:
      test:
              mov %ECX, DWORD PTR [%ESP + 4]
              mov %EAX, DWORD PTR [%ESP + 8]
              add %EAX, DWORD PTR [%ECX]
              ret
      
      Instead of:
      test:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %ECX, DWORD PTR [%ESP + 8]
              mov %EAX, DWORD PTR [%EAX]
              add %EAX, %ECX
              ret
      
      ... saving one instruction, and often a register.  Note that there are a lot
      of other instructions that could use this, but they aren't handled.  I'm not
      really interested in adding them, but mul/div and all of the FP instructions
      could be supported as well if someone wanted to add them.
      
      llvm-svn: 12204
    • Rearrange and refactor some code. No functionality changes. · 1dd6afe6
      Chris Lattner authored
      llvm-svn: 12203
  3. Mar 01, 2004
    • Handle passing constant integers to functions much more efficiently. · 1f4642c4
      Chris Lattner authored
      Instead of generating this code:
      
              mov %EAX, 4
              mov DWORD PTR [%ESP], %EAX
              mov %AX, 123
              movsx %EAX, %AX
              mov DWORD PTR [%ESP + 4], %EAX
              call Y
      
      we now generate:
              mov DWORD PTR [%ESP], 4
              mov DWORD PTR [%ESP + 4], 123
              call Y
      
      Which hurts the eyes less.  :)
      
      Considering that register pressure around call sites is already high (with
      all of the callee-clobbered registers and stuff), this may help a lot.
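
      The pattern being improved is a call with small constant arguments, e.g. (a hypothetical reconstruction; the signature of Y is an assumption):

      int Y(int, short);
      void test(void) {
          Y(4, 123);   /* both constants can now be stored to the stack directly */
      }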
      
      llvm-svn: 12028
    • Fix a minor code-quality issue. · 5c7d3cda
      Chris Lattner authored
      When passing 8- and 16-bit integer constants to function calls, we would
      emit dead code, like this:
      
      int Y(int, short, double);
      int X() {
        Y(4, 123, 4);
      }
      
      --- Old
      X:
              sub %ESP, 20
              mov %EAX, 4
              mov DWORD PTR [%ESP], %EAX
      ***     mov %AX, 123
              mov %AX, 123
              movsx %EAX, %AX
              mov DWORD PTR [%ESP + 4], %EAX
              fld QWORD PTR [.CPIX_0]
              fstp QWORD PTR [%ESP + 8]
              call Y
              mov %EAX, 0
              # IMPLICIT_USE %EAX %ESP
              add %ESP, 20
              ret
      
      Now we emit:
      X:
              sub %ESP, 20
              mov %EAX, 4
              mov DWORD PTR [%ESP], %EAX
              mov %AX, 123
              movsx %EAX, %AX
              mov DWORD PTR [%ESP + 4], %EAX
              fld QWORD PTR [.CPIX_0]
              fstp QWORD PTR [%ESP + 8]
              call Y
              mov %EAX, 0
              # IMPLICIT_USE %EAX %ESP
              add %ESP, 20
              ret
      
      Next up, eliminate the mov AX and movsx entirely!
      
      llvm-svn: 12026
  4. Feb 25, 2004
    • Teach the instruction selector how to transform 'array' GEP computations into X86 scaled indexes. · 309327a4
      Chris Lattner authored
      This allows us to compile GEP's like this:
      
      int* %test([10 x { int, { int } }]* %X, int %Idx) {
              %Idx = cast int %Idx to long
              %X = getelementptr [10 x { int, { int } }]* %X, long 0, long %Idx, ubyte 1, ubyte 0
              ret int* %X
      }
      
      Into a single address computation:
      
      test:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %ECX, DWORD PTR [%ESP + 8]
              lea %EAX, DWORD PTR [%EAX + 8*%ECX + 4]
              ret
      
      Before it generated:
      test:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %ECX, DWORD PTR [%ESP + 8]
              shl %ECX, 3
              add %EAX, %ECX
              lea %EAX, DWORD PTR [%EAX + 4]
              ret
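
      In C terms, the testcase corresponds roughly to (a hypothetical rendering of the IR above):

      struct S { int a; struct { int b; } inner; };   /* 8 bytes per element */
      int *test(struct S X[10], int Idx) {
          return &X[Idx].inner.b;   /* folds to lea [%EAX + 8*%ECX + 4] */
      }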
      
      This is useful for things like int/float/double arrays, as the indexing can be folded into
      the loads & stores, reducing register pressure and decreasing the pressure on the decode unit.
      With these changes, I expect our performance on 256.bzip2 and gzip to improve a lot.  On
      bzip2 for example, we go from this:
      
      10665 asm-printer           - Number of machine instrs printed
         40 ra-local              - Number of loads/stores folded into instructions
       1708 ra-local              - Number of loads added
       1532 ra-local              - Number of stores added
       1354 twoaddressinstruction - Number of instructions added
       1354 twoaddressinstruction - Number of two-address instructions
       2794 x86-peephole          - Number of peephole optimization performed
      
      to this:
      9873 asm-printer           - Number of machine instrs printed
        41 ra-local              - Number of loads/stores folded into instructions
      1710 ra-local              - Number of loads added
      1521 ra-local              - Number of stores added
       789 twoaddressinstruction - Number of instructions added
       789 twoaddressinstruction - Number of two-address instructions
      2142 x86-peephole          - Number of peephole optimization performed
      
      ... and these types of instructions are often in tight loops.
      
      Linear scan is also helped, but not as much.  It goes from:
      
      8787 asm-printer           - Number of machine instrs printed
      2389 liveintervals         - Number of identity moves eliminated after coalescing
      2288 liveintervals         - Number of interval joins performed
      3522 liveintervals         - Number of intervals after coalescing
      5810 liveintervals         - Number of original intervals
       700 spiller               - Number of loads added
       487 spiller               - Number of stores added
       303 spiller               - Number of register spills
      1354 twoaddressinstruction - Number of instructions added
      1354 twoaddressinstruction - Number of two-address instructions
       363 x86-peephole          - Number of peephole optimization performed
      
      to:
      
      7982 asm-printer           - Number of machine instrs printed
      1759 liveintervals         - Number of identity moves eliminated after coalescing
      1658 liveintervals         - Number of interval joins performed
      3282 liveintervals         - Number of intervals after coalescing
      4940 liveintervals         - Number of original intervals
       635 spiller               - Number of loads added
       452 spiller               - Number of stores added
       288 spiller               - Number of register spills
       789 twoaddressinstruction - Number of instructions added
       789 twoaddressinstruction - Number of two-address instructions
       258 x86-peephole          - Number of peephole optimization performed
      
      Though I'm not complaining about the drop in the number of intervals.  :)
      
      llvm-svn: 11820
    • * Make the previous patch more efficient by not allocating a temporary MachineInstr to do analysis. · d1ee55d4
      Chris Lattner authored
      
      *** FOLD getelementptr instructions into loads and stores when possible,
          making use of some of the crazy X86 addressing modes.
      
      For example, the following C++ program fragment:
      
      struct complex {
          double re, im;
          complex(double r, double i) : re(r), im(i) {}
      };
      inline complex operator+(const complex& a, const complex& b) {
          return complex(a.re+b.re, a.im+b.im);
      }
      complex addone(const complex& arg) {
          return arg + complex(1,0);
      }
      
      Used to be compiled to:
      _Z6addoneRK7complex:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %ECX, DWORD PTR [%ESP + 8]
      ***     mov %EDX, %ECX
              fld QWORD PTR [%EDX]
              fld1
              faddp %ST(1)
      ***     add %ECX, 8
              fld QWORD PTR [%ECX]
              fldz
              faddp %ST(1)
      ***     mov %ECX, %EAX
              fxch %ST(1)
              fstp QWORD PTR [%ECX]
      ***     add %EAX, 8
              fstp QWORD PTR [%EAX]
              ret
      
      Now it is compiled to:
      _Z6addoneRK7complex:
              mov %EAX, DWORD PTR [%ESP + 4]
              mov %ECX, DWORD PTR [%ESP + 8]
              fld QWORD PTR [%ECX]
              fld1
              faddp %ST(1)
              fld QWORD PTR [%ECX + 8]
              fldz
              faddp %ST(1)
              fxch %ST(1)
              fstp QWORD PTR [%EAX]
              fstp QWORD PTR [%EAX + 8]
              ret
      
      Other programs should see similar improvements, across the board.  Note that
      in addition to reducing instruction count, this also reduces register pressure
      a lot, always a good thing on X86.  :)
      
      llvm-svn: 11819