  1. Oct 01, 2005
  2. Sep 29, 2005
  3. Sep 28, 2005
  4. Sep 27, 2005
    • Avoid spilling stack slots... to stack slots. · e285f5ed
      Chris Lattner authored
      llvm-svn: 23478
    • Completely rewrite 'correct' eh support. This changes how setjmp insertion · 87eb2493
      Chris Lattner authored
      is performed so it happens at most once per function that contains an invoke
      instead of once per invoke in the function.  This patch has the following perks:
      
      1. It fixes PR631, which complains about slowness.
      2. It fixes PR240, which complains about non-volatile vars being live across
         setjmp/longjmps.
      3. It improves (but does not fix) the jmpbuf alignment issue on Itanium by not
         forcing the jmpbufs to always be 8 bytes off the alignment of the structure.
      4. It speeds up 253.perlbmk from 338s to 13.70s (a 25x improvement!), making us
         now about 4% faster than GCC.
      
      Further improvements are also possible.
      
      llvm-svn: 23477
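
      A minimal C sketch of the shape described above, with illustrative names
      rather than the pass's actual output: one setjmp guards the whole function
      and a call-site number selects the handler, instead of one setjmp per invoke.

      #include <setjmp.h>

      static jmp_buf fn_jbuf;    /* one buffer for the whole function            */
      static int call_site;      /* which invoke was active when we unwound      */
      static int fail;           /* set to nonzero to simulate an exception path */

      static void callee(void) {
        if (fail)
          longjmp(fn_jbuf, 1);              /* "throw": unwind to the caller's setjmp */
      }

      void caller_with_two_invokes(void) {
        if (setjmp(fn_jbuf) == 0) {         /* runs once per call, not once per invoke */
          call_site = 1;
          callee();                         /* first invoke  */
          call_site = 2;
          callee();                         /* second invoke */
          return;
        }
        switch (call_site) {                /* dispatch to the matching landing pad */
        case 1: /* handler for the first invoke  */ break;
        case 2: /* handler for the second invoke */ break;
        }
      }

      A real implementation also has to chain these buffers across calling
      functions so a longjmp reaches the nearest active handler; the sketch
      leaves that out.
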
    • Make the pass name simpler · 92233d21
      Chris Lattner authored
      llvm-svn: 23476
  5. Sep 26, 2005
  6. Sep 25, 2005
  7. Sep 18, 2005
    • Refactor this code a bit and make it more general. This now compiles: · b4b2530a
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus2 (unsigned int x) { b.j += x; }
      
      To:
      
      _plus2:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              slwi r3, r3, 6
              add r3, r4, r3
              rlwimi r3, r4, 0, 26, 14
              stw r3, 0(r2)
              blr
      
      
      instead of:
      
      _plus2:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              rlwinm r5, r4, 26, 21, 31
              add r3, r5, r3
              rlwimi r4, r3, 6, 15, 25
              stw r4, 0(r2)
              blr
      
      by eliminating an 'and'.
      
      I'm pretty sure this is as small as we can go :)
      
      llvm-svn: 23386
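
      A word-level C sketch of what the new sequence computes, assuming the
      layout i = bits 0-5, j = bits 6-16, k = bits 17-31 (inferred from the
      struct, not stated in the commit): x is added into j's position in the
      whole word, and the final rlwimi re-inserts every bit outside j from the
      old value, so any carry that spills into k is discarded and the pre-add
      extraction (the eliminated 'and') is unnecessary.

      /* plus2: b.j += x, operating on the 32-bit word w that holds b. */
      unsigned plus2_word(unsigned w, unsigned x) {
        unsigned j_mask = 0x7FFu << 6;          /* the 11 bits of j             */
        unsigned sum = w + (x << 6);            /* add x directly into j's slot */
        return (sum & j_mask) | (w & ~j_mask);  /* keep j from sum, the rest from w */
      }
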
    • Compile · 797dee77
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus2 (unsigned int x) {
        b.j += x;
      }
      
      to:
      
      plus2:
              mov %EAX, DWORD PTR [b]
              mov %ECX, %EAX
              and %ECX, 131008
              mov %EDX, DWORD PTR [%ESP + 4]
              shl %EDX, 6
              add %EDX, %ECX
              and %EDX, 131008
              and %EAX, -131009
              or %EDX, %EAX
              mov DWORD PTR [b], %EDX
              ret
      
      instead of:
      
      plus2:
              mov %EAX, DWORD PTR [b]
              mov %ECX, %EAX
              shr %ECX, 6
              and %ECX, 2047
              add %ECX, DWORD PTR [%ESP + 4]
              shl %ECX, 6
              and %ECX, 131008
              and %EAX, -131009
              or %ECX, %EAX
              mov DWORD PTR [b], %ECX
              ret
      
      llvm-svn: 23385
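
      For reference, the decimal immediates above decoded, assuming the same
      field layout as before (i = bits 0-5, j = bits 6-16, k = bits 17-31):

      enum {
        J_MASK = 0x7FF << 6,   /*  131008: the 11 j bits in place                  */
        NOT_J  = ~J_MASK,      /* -131009: every bit except j                      */
        J_MAX  = 0x7FF         /*    2047: mask for a j value already shifted down */
      };
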
    • Generalize this transform, using MaskedValueIsZero, allowing us to compile: · 01f56c68
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus3 (unsigned int x) { b.k += x; }
      
      To:
      
      plus3:
              mov %EAX, DWORD PTR [%ESP + 4]
              shl %EAX, 17
              add DWORD PTR [b], %EAX
              ret
      
      instead of:
      
      plus3:
              mov %EAX, DWORD PTR [%ESP + 4]
              shl %EAX, 17
              mov %ECX, DWORD PTR [b]
              add %EAX, %ECX
              and %EAX, -131072
              and %ECX, 131071
              or %ECX, %EAX
              mov DWORD PTR [b], %ECX
              ret
      
      llvm-svn: 23384
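
      A C sketch of why this collapses to a single add, assuming k occupies the
      top 15 bits of the word: (x << 17) has zeros below bit 17, so the add
      cannot disturb i or j, and a carry out of bit 31 simply falls off the
      32-bit word, which is the field's own wrap-around.  MaskedValueIsZero is
      what proves the low bits stay untouched, making the old mask-and-merge
      sequence dead.

      /* plus3: b.k += x, on the 32-bit word w that holds b. */
      unsigned plus3_word(unsigned w, unsigned x) {
        return w + (x << 17);   /* no extract, no mask, no re-insert needed */
      }
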
    • fix typo · 4ebc8ab4
      Chris Lattner authored
      llvm-svn: 23383
    • Remove unintentionally committed code · e5b23a6d
      Chris Lattner authored
      llvm-svn: 23382
    • implement shift.ll:test25. This compiles: · 27cb9dbd
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus3 (unsigned int x) {
        b.k += x;
      }
      
      to:
      
      _plus3:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r3, 0(r2)
              rlwinm r4, r3, 0, 0, 14
              add r4, r4, r3
              rlwimi r4, r3, 0, 15, 31
              stw r4, 0(r2)
              blr
      
      instead of:
      
      _plus3:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              srwi r5, r4, 17
              add r3, r5, r3
              slwi r3, r3, 17
              rlwimi r3, r4, 0, 15, 31
              stw r3, 0(r2)
              blr
      
      llvm-svn: 23381
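
      For readers decoding these PowerPC sequences, a small C model of the two
      rotate-and-mask instructions used above, with IBM bit numbering (bit 0 is
      the most significant bit); this describes the instructions in general and
      is not part of the commit.

      #include <stdint.h>

      /* Mask of ones from bit mb through bit me inclusive (bit 0 = MSB),
         wrapping around when mb > me, as the PowerPC mask fields do. */
      static uint32_t ppc_mask(unsigned mb, unsigned me) {
        uint32_t from_mb = 0xFFFFFFFFu >> mb;         /* ones in bits mb..31 */
        uint32_t to_me   = 0xFFFFFFFFu << (31 - me);  /* ones in bits 0..me  */
        return (mb <= me) ? (from_mb & to_me) : (from_mb | to_me);
      }

      /* rlwinm rD, rS, sh, mb, me: rotate rS left by sh, then AND with the mask. */
      static uint32_t rlwinm(uint32_t rs, unsigned sh, unsigned mb, unsigned me) {
        uint32_t rot = (rs << sh) | (sh ? rs >> (32 - sh) : 0);
        return rot & ppc_mask(mb, me);
      }

      /* rlwimi rD, rS, sh, mb, me: rotate rS left by sh, then insert it into rD
         under the mask; bits outside the mask keep rD's old value. */
      static uint32_t rlwimi(uint32_t rd, uint32_t rs, unsigned sh, unsigned mb, unsigned me) {
        uint32_t rot = (rs << sh) | (sh ? rs >> (32 - sh) : 0);
        uint32_t m = ppc_mask(mb, me);
        return (rot & m) | (rd & ~m);
      }
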
    • Implement add.ll:test29. Codegening: · af517574
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus1 (unsigned int x) {
        b.i += x;
      }
      
      as:
      _plus1:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              add r3, r4, r3
              rlwimi r3, r4, 0, 0, 25
              stw r3, 0(r2)
              blr
      
      instead of:
      
      _plus1:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              rlwinm r5, r4, 0, 26, 31
              add r3, r5, r3
              rlwimi r3, r4, 0, 0, 25
              stw r3, 0(r2)
              blr
      
      llvm-svn: 23379
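
      The same word-level trick at the bottom of the word: with i in bits 0-5
      (assumed layout), x is added to the whole word and the rlwimi re-inserts
      the old j and k bits, so only i's six bits keep the sum.

      /* plus1: b.i += x, on the 32-bit word w that holds b. */
      unsigned plus1_word(unsigned w, unsigned x) {
        return ((w + x) & 0x3Fu) | (w & ~0x3Fu);   /* add, then restore bits 6-31 */
      }
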
    • remove debug output · 027eaf01
      Chris Lattner authored
      llvm-svn: 23377
    • Implement or.ll:test21. This teaches instcombine to be able to turn this: · 15212989
      Chris Lattner authored
      struct {
         unsigned int bit0:1;
         unsigned int ubyte:31;
      } sdata;
      
      void foo() {
        sdata.ubyte++;
      }
      
      into this:
      
      foo:
              add DWORD PTR [sdata], 2
              ret
      
      instead of this:
      
      foo:
              mov %EAX, DWORD PTR [sdata]
              mov %ECX, %EAX
              add %ECX, 2
              and %ECX, -2
              and %EAX, 1
              or %EAX, %ECX
              mov DWORD PTR [sdata], %EAX
              ret
      
      llvm-svn: 23376
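
      A sketch of why the whole increment becomes one word-sized add, assuming
      bit0 lives in bit 0 and ubyte in bits 1-31: adding 2 cannot change bit 0,
      because carries only propagate upward, and a carry out of bit 31 is lost,
      which is exactly the 31-bit field's own wrap-around.

      /* foo: sdata.ubyte++, on the 32-bit word w that holds sdata. */
      unsigned inc_ubyte_word(unsigned w) {
        return w + 2;   /* equivalent to extract ubyte, add 1, re-insert */
      }
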
  8. Sep 14, 2005
  9. Sep 13, 2005
  10. Sep 12, 2005
    • Fix a regression from last night, which caused this pass to create invalid · 8048b85e
      Chris Lattner authored
      code for IV uses outside of loops that are not dominated by the latch block.
      We should only convert these uses to use the post-inc value if they ARE
      dominated by the latch block.
      
      Also use a new LoopInfo method to simplify some code.
      
      This fixes Transforms/LoopStrengthReduce/2005-09-12-UsesOutOutsideOfLoop.ll
      
      llvm-svn: 23318
    • _test: · a6764839
      Chris Lattner authored
              li r2, 0
      LBB_test_1:     ; no_exit.2
              li r5, 0
              stw r5, 0(r3)
              addi r2, r2, 1
              addi r3, r3, 4
              cmpwi cr0, r2, 701
              blt cr0, LBB_test_1     ; no_exit.2
      LBB_test_2:     ; loopexit.2.loopexit
              addi r2, r2, 1
              stw r2, 0(r4)
              blr
      Uses of IV's outside of the loop should use the post-incremented version
      of the IV, not the pre-incremented version.  This helps many loops (e.g. in sixtrack)
      which used to generate code like this (this is the code from the
      dont-hoist-simple-loop-constants.ll testcase):
      
      _test:
              li r2, 0                 **** IV starts at 0
      LBB_test_1:     ; no_exit.2
              or r5, r2, r2            **** Copy for loop exit
              li r2, 0
              stw r2, 0(r3)
              addi r3, r3, 4
              addi r2, r5, 1
              addi r6, r5, 2           **** IV+2
              cmpwi cr0, r6, 701
              blt cr0, LBB_test_1     ; no_exit.2
      LBB_test_2:     ; loopexit.2.loopexit
              addi r2, r5, 2       ****  IV+2
              stw r2, 0(r4)
              blr
      
      And now generated code like this:
      
      _test:
              li r2, 1               *** IV starts at 1
      LBB_test_1:     ; no_exit.2
              li r5, 0
              stw r5, 0(r3)
              addi r2, r2, 1
              addi r3, r3, 4
              cmpwi cr0, r2, 701     *** IV.postinc + 0
              blt cr0, LBB_test_1
      LBB_test_2:     ; loopexit.2.loopexit
              stw r2, 0(r4)          *** IV.postinc + 0
              blr
      
      llvm-svn: 23313
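
      A C-level picture of the difference, using an illustrative loop rather than
      the sixtrack or testcase source: the only use of the induction variable
      after the loop wants its final value, and rewriting that use to the
      post-incremented value removes the copy (the "or r5, r2, r2") the old code
      kept live across every iteration.

      void test(int *a, int *count) {
        int i;
        for (i = 0; i != 701; ++i)   /* inside the loop, only the compare needs i */
          a[i] = 0;
        *count = i;                  /* outside the loop, only the final
                                        (post-incremented) value is needed */
      }
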
  11. Sep 10, 2005
    • implement Transforms/LoopStrengthReduce/dont-hoist-simple-loop-constants.ll. · 530fe6ab
      Chris Lattner authored
      We used to emit this code for it:
      
      _test:
              li r2, 1     ;; Value tying up a register for the whole loop
              li r5, 0
      LBB_test_1:     ; no_exit.2
              or r6, r5, r5
              li r5, 0
              stw r5, 0(r3)
              addi r5, r6, 1
              addi r3, r3, 4
              add r7, r2, r5  ;; should be addi r7, r5, 1
              cmpwi cr0, r7, 701
              blt cr0, LBB_test_1     ; no_exit.2
      LBB_test_2:     ; loopexit.2.loopexit
              addi r2, r6, 2
              stw r2, 0(r4)
              blr
      
      now we emit this:
      
      _test:
              li r2, 0
      LBB_test_1:     ; no_exit.2
              or r5, r2, r2
              li r2, 0
              stw r2, 0(r3)
              addi r3, r3, 4
              addi r2, r5, 1
              addi r6, r5, 2   ;; whoa, fold those adds!
              cmpwi cr0, r6, 701
              blt cr0, LBB_test_1     ; no_exit.2
      LBB_test_2:     ; loopexit.2.loopexit
              addi r2, r5, 2
              stw r2, 0(r4)
              blr
      
      more improvement coming.
      
      llvm-svn: 23306
  12. Sep 02, 2005
  13. Aug 24, 2005
  14. Aug 17, 2005
  15. Aug 16, 2005
  16. Aug 13, 2005
    • Ooops, don't forget to clear this. The real inner loop is now: · 47d3ec35
      Chris Lattner authored
      .LBB_foo_3:     ; no_exit.1
              lfd f2, 0(r9)
              lfd f3, 8(r9)
              fmul f4, f1, f2
              fmadd f4, f0, f3, f4
              stfd f4, 8(r9)
              fmul f3, f1, f3
              fmsub f2, f0, f2, f3
              stfd f2, 0(r9)
              addi r9, r9, 16
              addi r8, r8, 1
              cmpw cr0, r8, r4
              ble .LBB_foo_3  ; no_exit.1
      
      llvm-svn: 22782
    • Recursively scan scev expressions for common subexpressions. This allows us · 5949d490
      Chris Lattner authored
      to handle nested loops much better, for example, by being able to tell that
      these two expressions:
      
      {( 8 + ( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp12)}<loopentry.1>
      
      {(( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp12)}<loopentry.1>
      
      have the following common part that can be shared:
      {(( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp12)}<loopentry.1>
      
      This allows us to codegen an important inner loop in 168.wupwise as:
      
      .LBB_foo_4:     ; no_exit.1
              lfd f2, 16(r9)
              fmul f3, f0, f2
              fmul f2, f1, f2
              fadd f4, f3, f2
              stfd f4, 8(r9)
              fsub f2, f3, f2
              stfd f2, 16(r9)
              addi r8, r8, 1
              addi r9, r9, 16
              cmpw cr0, r8, r4
              ble .LBB_foo_4  ; no_exit.1
      
      instead of:
      
      .LBB_foo_3:     ; no_exit.1
              lfdx f2, r6, r9
              add r10, r6, r9
              lfd f3, 8(r10)
              fmul f4, f1, f2
              fmadd f4, f0, f3, f4
              stfd f4, 8(r10)
              fmul f3, f1, f3
              fmsub f2, f0, f2, f3
              stfdx f2, r6, r9
              addi r9, r9, 16
              addi r8, r8, 1
              cmpw cr0, r8, r4
              ble .LBB_foo_3  ; no_exit.1
      
      llvm-svn: 22781
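
      An illustration, with made-up names rather than the wupwise source, of what
      sharing the common add-rec buys: {A,+,B}<L> stands for the value A + B*i on
      iteration i of loop L, so the two expressions above differ only by the
      constant 8, and a single strength-reduced pointer (one addi per iteration)
      can serve both addresses.

      void inner_loop(double *base, long stride, long n) {
        double *p = base;            /* materializes the shared {base,+,stride} part */
        for (long i = 0; i < n; ++i) {
          p[1] = p[1] + p[0];        /* address: shared part + 8 bytes      */
          p[0] = p[0] - p[1];        /* address: shared part                */
          p += stride;               /* one induction update per iteration  */
        }
      }
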
    • remove dead code. The exit block list is computed on demand, thus does not · 79396539
      Chris Lattner authored
      need to be updated.  This code is a relic from when it did.
      
      llvm-svn: 22775
    • When splitting critical edges, make sure not to leave the new block in the · 8447b495
      Chris Lattner authored
      middle of the loop.  This turns a critical loop in gzip into this:
      
      .LBB_test_1:    ; loopentry
              or r27, r28, r28
              add r28, r3, r27
              lhz r28, 3(r28)
              add r26, r4, r27
              lhz r26, 3(r26)
              cmpw cr0, r28, r26
              bne .LBB_test_8 ; loopentry.loopexit_crit_edge
      .LBB_test_2:    ; shortcirc_next.0
              add r28, r3, r27
              lhz r28, 5(r28)
              add r26, r4, r27
              lhz r26, 5(r26)
              cmpw cr0, r28, r26
              bne .LBB_test_7 ; shortcirc_next.0.loopexit_crit_edge
      .LBB_test_3:    ; shortcirc_next.1
              add r28, r3, r27
              lhz r28, 7(r28)
              add r26, r4, r27
              lhz r26, 7(r26)
              cmpw cr0, r28, r26
              bne .LBB_test_6 ; shortcirc_next.1.loopexit_crit_edge
      .LBB_test_4:    ; shortcirc_next.2
              add r28, r3, r27
              lhz r26, 9(r28)
              add r28, r4, r27
              lhz r25, 9(r28)
              addi r28, r27, 8
              cmpw cr7, r26, r25
              mfcr r26, 1
              rlwinm r26, r26, 31, 31, 31
              add r25, r8, r27
              cmpw cr7, r25, r7
              mfcr r25, 1
              rlwinm r25, r25, 29, 31, 31
              and. r26, r26, r25
              bne .LBB_test_1 ; loopentry
      
      instead of this:
      
      .LBB_test_1:    ; loopentry
              or r27, r28, r28
              add r28, r3, r27
              lhz r28, 3(r28)
              add r26, r4, r27
              lhz r26, 3(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_3 ; shortcirc_next.0
      .LBB_test_2:    ; loopentry.loopexit_crit_edge
              add r2, r30, r27
              add r8, r29, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_3:    ; shortcirc_next.0
              add r28, r3, r27
              lhz r28, 5(r28)
              add r26, r4, r27
              lhz r26, 5(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_5 ; shortcirc_next.1
      .LBB_test_4:    ; shortcirc_next.0.loopexit_crit_edge
              add r2, r11, r27
              add r8, r12, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_5:    ; shortcirc_next.1
              add r28, r3, r27
              lhz r28, 7(r28)
              add r26, r4, r27
              lhz r26, 7(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_7 ; shortcirc_next.2
      .LBB_test_6:    ; shortcirc_next.1.loopexit_crit_edge
              add r2, r9, r27
              add r8, r10, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_7:    ; shortcirc_next.2
              add r28, r3, r27
              lhz r26, 9(r28)
              add r28, r4, r27
              lhz r25, 9(r28)
              addi r28, r27, 8
              cmpw cr7, r26, r25
              mfcr r26, 1
              rlwinm r26, r26, 31, 31, 31
              add r25, r8, r27
              cmpw cr7, r25, r7
              mfcr r25, 1
              rlwinm r25, r25, 29, 31, 31
              and. r26, r26, r25
              bne .LBB_test_1 ; loopentry
      
      Next up, improve the code for the loop.
      
      llvm-svn: 22769
    • Fix a FIXME: if we are inserting code for a PHI argument, split the critical · 4fec86d3
      Chris Lattner authored
      edge so that the code is not always executed for both operands.  This
      prevents LSR from inserting code into loops whose exit blocks contain
      PHI uses of IV expressions (which are outside of loops).  On gzip, for
      example, we turn this ugly code:
      
      .LBB_test_1:    ; loopentry
              add r27, r3, r28
              lhz r27, 3(r27)
              add r26, r4, r28
              lhz r26, 3(r26)
              add r25, r30, r28    ;; Only live if exiting the loop
              add r24, r29, r28    ;; Only live if exiting the loop
              cmpw cr0, r27, r26
              bne .LBB_test_5 ; loopexit
      
      into this:
      
      .LBB_test_1:    ; loopentry
              or r27, r28, r28
              add r28, r3, r27
              lhz r28, 3(r28)
              add r26, r4, r27
              lhz r26, 3(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_3 ; shortcirc_next.0
      .LBB_test_2:    ; loopentry.loopexit_crit_edge
              add r2, r30, r27
              add r8, r29, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_2:    ; shortcirc_next.0
              ...
              blt .LBB_test_1
      
      
      into this:
      
      .LBB_test_1:    ; loopentry
              or r27, r28, r28
              add r28, r3, r27
              lhz r28, 3(r28)
              add r26, r4, r27
              lhz r26, 3(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_3 ; shortcirc_next.0
      .LBB_test_2:    ; loopentry.loopexit_crit_edge
              add r2, r30, r27
              add r8, r29, r27
              b .LBB_test_3   ; shortcirc_next.0
      .LBB_test_3:    ; shortcirc_next.0
              ...
              blt .LBB_test_1
      
      
      Next step: get the block out of the loop so that the loop is all
      fall-throughs again.
      
      llvm-svn: 22766
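
      A source-level way to see why the edge is split, with an illustrative loop
      rather than the gzip code: the address computations are live only on the
      path that leaves the loop, so placing them on the split exit edge keeps
      them out of the loop body, where the old code executed them on every
      iteration.

      #include <stdint.h>

      long first_mismatch(const short *a, const short *b, long i,
                          intptr_t *out_a, intptr_t *out_b) {
        while (a[i] == b[i])         /* loop body: only the loads and the compare */
          ++i;
        /* exit edge: computed only when the loop is actually left, matching the
           "add r2, r30, r27 / add r8, r29, r27" pair in the asm above */
        *out_a = (intptr_t)(a + i);
        *out_b = (intptr_t)(b + i);
        return i;
      }
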