Skip to content
  1. Sep 26, 2005
  2. Sep 25, 2005
  3. Sep 18, 2005
    • Chris Lattner's avatar
      Refactor this code a bit and make it more general. This now compiles: · b4b2530a
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus2 (unsigned int x) { b.j += x; }
      
      To:
      
      _plus2:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              slwi r3, r3, 6
              add r3, r4, r3
              rlwimi r3, r4, 0, 26, 14
              stw r3, 0(r2)
              blr
      
      
      instead of:
      
      _plus2:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              rlwinm r5, r4, 26, 21, 31
              add r3, r5, r3
              rlwimi r4, r3, 6, 15, 25
              stw r4, 0(r2)
              blr
      
      by eliminating an 'and'.
      
      I'm pretty sure this is as small as we can go :)
      
      llvm-svn: 23386
      b4b2530a
    • Chris Lattner's avatar
      Compile · 797dee77
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus2 (unsigned int x) {
        b.j += x;
      }
      
      to:
      
      plus2:
              mov %EAX, DWORD PTR [b]
              mov %ECX, %EAX
              and %ECX, 131008
              mov %EDX, DWORD PTR [%ESP + 4]
              shl %EDX, 6
              add %EDX, %ECX
              and %EDX, 131008
              and %EAX, -131009
              or %EDX, %EAX
              mov DWORD PTR [b], %EDX
              ret
      
      instead of:
      
      plus2:
              mov %EAX, DWORD PTR [b]
              mov %ECX, %EAX
              shr %ECX, 6
              and %ECX, 2047
              add %ECX, DWORD PTR [%ESP + 4]
              shl %ECX, 6
              and %ECX, 131008
              and %EAX, -131009
              or %ECX, %EAX
              mov DWORD PTR [b], %ECX
              ret
      
      llvm-svn: 23385
      797dee77
    • Chris Lattner's avatar
      Generalize this transform, using MaskedValueIsZero, allowing us to compile: · 01f56c68
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus3 (unsigned int x) { b.k += x; }
      
      To:
      
      plus3:
              mov %EAX, DWORD PTR [%ESP + 4]
              shl %EAX, 17
              add DWORD PTR [b], %EAX
              ret
      
      instead of:
      
      plus3:
              mov %EAX, DWORD PTR [%ESP + 4]
              shl %EAX, 17
              mov %ECX, DWORD PTR [b]
              add %EAX, %ECX
              and %EAX, -131072
              and %ECX, 131071
              or %ECX, %EAX
              mov DWORD PTR [b], %ECX
              ret
      
      llvm-svn: 23384
      01f56c68
    • Chris Lattner's avatar
      fix typo · 4ebc8ab4
      Chris Lattner authored
      llvm-svn: 23383
      4ebc8ab4
    • Chris Lattner's avatar
      Remove unintentionally committed code · e5b23a6d
      Chris Lattner authored
      llvm-svn: 23382
      e5b23a6d
    • Chris Lattner's avatar
      implement shift.ll:test25. This compiles: · 27cb9dbd
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus3 (unsigned int x) {
        b.k += x;
      }
      
      to:
      
      _plus3:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r3, 0(r2)
              rlwinm r4, r3, 0, 0, 14
              add r4, r4, r3
              rlwimi r4, r3, 0, 15, 31
              stw r4, 0(r2)
              blr
      
      instead of:
      
      _plus3:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              srwi r5, r4, 17
              add r3, r5, r3
              slwi r3, r3, 17
              rlwimi r3, r4, 0, 15, 31
              stw r3, 0(r2)
              blr
      
      llvm-svn: 23381
      27cb9dbd
    • Chris Lattner's avatar
      Implement add.ll:test29. Codegening: · af517574
      Chris Lattner authored
      struct S { unsigned int i : 6, j : 11, k : 15; } b;
      void plus1 (unsigned int x) {
        b.i += x;
      }
      
      as:
      _plus1:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              add r3, r4, r3
              rlwimi r3, r4, 0, 0, 25
              stw r3, 0(r2)
              blr
      
      instead of:
      
      _plus1:
              lis r2, ha16(L_b$non_lazy_ptr)
              lwz r2, lo16(L_b$non_lazy_ptr)(r2)
              lwz r4, 0(r2)
              rlwinm r5, r4, 0, 26, 31
              add r3, r5, r3
              rlwimi r3, r4, 0, 0, 25
              stw r3, 0(r2)
              blr
      
      llvm-svn: 23379
      af517574
    • Chris Lattner's avatar
      remove debug output · 027eaf01
      Chris Lattner authored
      llvm-svn: 23377
      027eaf01
    • Chris Lattner's avatar
      Implement or.ll:test21. This teaches instcombine to be able to turn this: · 15212989
      Chris Lattner authored
      struct {
         unsigned int bit0:1;
         unsigned int ubyte:31;
      } sdata;
      
      void foo() {
        sdata.ubyte++;
      }
      
      into this:
      
      foo:
              add DWORD PTR [sdata], 2
              ret
      
      instead of this:
      
      foo:
              mov %EAX, DWORD PTR [sdata]
              mov %ECX, %EAX
              add %ECX, 2
              and %ECX, -2
              and %EAX, 1
              or %EAX, %ECX
              mov DWORD PTR [sdata], %EAX
              ret
      
      llvm-svn: 23376
      15212989
  4. Sep 14, 2005
  5. Sep 13, 2005
  6. Sep 12, 2005
    • Chris Lattner's avatar
      Fix a regression from last night, which caused this pass to create invalid · 8048b85e
      Chris Lattner authored
      code for IV uses outside of loops that are not dominated by the latch block.
      We should only convert these uses to use the post-inc value if they ARE
      dominated by the latch block.
      
      Also use a new LoopInfo method to simplify some code.
      
      This fixes Transforms/LoopStrengthReduce/2005-09-12-UsesOutOutsideOfLoop.ll
      
      llvm-svn: 23318
      8048b85e
    • Chris Lattner's avatar
      _test: · a6764839
      Chris Lattner authored
              li r2, 0
      LBB_test_1:     ; no_exit.2
              li r5, 0
              stw r5, 0(r3)
              addi r2, r2, 1
              addi r3, r3, 4
              cmpwi cr0, r2, 701
              blt cr0, LBB_test_1     ; no_exit.2
      LBB_test_2:     ; loopexit.2.loopexit
              addi r2, r2, 1
              stw r2, 0(r4)
              blr
      [zion ~/llvm]$ cat > ~/xx
      Uses of IVs outside of the loop should use the post-incremented version
      of the IV, not the preincremented version.  This helps many loops (e.g. in sixtrack)
      which used to generate code like this (this is the code from the
      dont-hoist-simple-loop-constants.ll testcase):
      
      _test:
              li r2, 0                 **** IV starts at 0
      LBB_test_1:     ; no_exit.2
              or r5, r2, r2            **** Copy for loop exit
              li r2, 0
              stw r2, 0(r3)
              addi r3, r3, 4
              addi r2, r5, 1
              addi r6, r5, 2           **** IV+2
              cmpwi cr0, r6, 701
              blt cr0, LBB_test_1     ; no_exit.2
      LBB_test_2:     ; loopexit.2.loopexit
              addi r2, r5, 2       ****  IV+2
              stw r2, 0(r4)
              blr
      
      And now generate code like this:
      
      _test:
              li r2, 1               *** IV starts at 1
      LBB_test_1:     ; no_exit.2
              li r5, 0
              stw r5, 0(r3)
              addi r2, r2, 1
              addi r3, r3, 4
              cmpwi cr0, r2, 701     *** IV.postinc + 0
              blt cr0, LBB_test_1
      LBB_test_2:     ; loopexit.2.loopexit
              stw r2, 0(r4)          *** IV.postinc + 0
              blr
      
      llvm-svn: 23313
      a6764839
  7. Sep 10, 2005
    • Chris Lattner's avatar
      implement Transforms/LoopStrengthReduce/dont-hoist-simple-loop-constants.ll. · 530fe6ab
      Chris Lattner authored
      We used to emit this code for it:
      
      _test:
              li r2, 1     ;; Value tying up a register for the whole loop
              li r5, 0
      LBB_test_1:     ; no_exit.2
              or r6, r5, r5
              li r5, 0
              stw r5, 0(r3)
              addi r5, r6, 1
              addi r3, r3, 4
              add r7, r2, r5  ;; should be addi r7, r5, 1
              cmpwi cr0, r7, 701
              blt cr0, LBB_test_1     ; no_exit.2
      LBB_test_2:     ; loopexit.2.loopexit
              addi r2, r6, 2
              stw r2, 0(r4)
              blr
      
      now we emit this:
      
      _test:
              li r2, 0
      LBB_test_1:     ; no_exit.2
              or r5, r2, r2
              li r2, 0
              stw r2, 0(r3)
              addi r3, r3, 4
              addi r2, r5, 1
              addi r6, r5, 2   ;; whoa, fold those adds!
              cmpwi cr0, r6, 701
              blt cr0, LBB_test_1     ; no_exit.2
      LBB_test_2:     ; loopexit.2.loopexit
              addi r2, r5, 2
              stw r2, 0(r4)
              blr
      
      more improvement coming.
      
      llvm-svn: 23306
      530fe6ab
  8. Sep 02, 2005
  9. Aug 24, 2005
  10. Aug 17, 2005
  11. Aug 16, 2005
  12. Aug 13, 2005
    • Chris Lattner's avatar
      Ooops, don't forget to clear this. The real inner loop is now: · 47d3ec35
      Chris Lattner authored
      .LBB_foo_3:     ; no_exit.1
              lfd f2, 0(r9)
              lfd f3, 8(r9)
              fmul f4, f1, f2
              fmadd f4, f0, f3, f4
              stfd f4, 8(r9)
              fmul f3, f1, f3
              fmsub f2, f0, f2, f3
              stfd f2, 0(r9)
              addi r9, r9, 16
              addi r8, r8, 1
              cmpw cr0, r8, r4
              ble .LBB_foo_3  ; no_exit.1
      
      llvm-svn: 22782
      47d3ec35
    • Chris Lattner's avatar
      Recursively scan scev expressions for common subexpressions. This allows us · 5949d490
      Chris Lattner authored
      to handle nested loops much better, for example, by being able to tell that
      these two expressions:
      
      {( 8 + ( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp12)}<loopentry.1>
      
      {(( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp12)}<loopentry.1>
      
      Have the following common part that can be shared:
      {(( 16 * ( 1 +  %Tmp11 +  %Tmp12)) +  %c_),+,( 16 *  %Tmp12)}<loopentry.1>
      
      This allows us to codegen an important inner loop in 168.wupwise as:
      
      .LBB_foo_4:     ; no_exit.1
              lfd f2, 16(r9)
              fmul f3, f0, f2
              fmul f2, f1, f2
              fadd f4, f3, f2
              stfd f4, 8(r9)
              fsub f2, f3, f2
              stfd f2, 16(r9)
              addi r8, r8, 1
              addi r9, r9, 16
              cmpw cr0, r8, r4
              ble .LBB_foo_4  ; no_exit.1
      
      instead of:
      
      .LBB_foo_3:     ; no_exit.1
              lfdx f2, r6, r9
              add r10, r6, r9
              lfd f3, 8(r10)
              fmul f4, f1, f2
              fmadd f4, f0, f3, f4
              stfd f4, 8(r10)
              fmul f3, f1, f3
              fmsub f2, f0, f2, f3
              stfdx f2, r6, r9
              addi r9, r9, 16
              addi r8, r8, 1
              cmpw cr0, r8, r4
              ble .LBB_foo_3  ; no_exit.1
      
      llvm-svn: 22781
      5949d490
    • Chris Lattner's avatar
      remove dead code. The exit block list is computed on demand, thus does not · 79396539
      Chris Lattner authored
      need to be updated.  This code is a relic from when it did.
      
      llvm-svn: 22775
      79396539
    • Chris Lattner's avatar
      When splitting critical edges, make sure not to leave the new block in the · 8447b495
      Chris Lattner authored
      middle of the loop.  This turns a critical loop in gzip into this:
      
      .LBB_test_1:    ; loopentry
              or r27, r28, r28
              add r28, r3, r27
              lhz r28, 3(r28)
              add r26, r4, r27
              lhz r26, 3(r26)
              cmpw cr0, r28, r26
              bne .LBB_test_8 ; loopentry.loopexit_crit_edge
      .LBB_test_2:    ; shortcirc_next.0
              add r28, r3, r27
              lhz r28, 5(r28)
              add r26, r4, r27
              lhz r26, 5(r26)
              cmpw cr0, r28, r26
              bne .LBB_test_7 ; shortcirc_next.0.loopexit_crit_edge
      .LBB_test_3:    ; shortcirc_next.1
              add r28, r3, r27
              lhz r28, 7(r28)
              add r26, r4, r27
              lhz r26, 7(r26)
              cmpw cr0, r28, r26
              bne .LBB_test_6 ; shortcirc_next.1.loopexit_crit_edge
      .LBB_test_4:    ; shortcirc_next.2
              add r28, r3, r27
              lhz r26, 9(r28)
              add r28, r4, r27
              lhz r25, 9(r28)
              addi r28, r27, 8
              cmpw cr7, r26, r25
              mfcr r26, 1
              rlwinm r26, r26, 31, 31, 31
              add r25, r8, r27
              cmpw cr7, r25, r7
              mfcr r25, 1
              rlwinm r25, r25, 29, 31, 31
              and. r26, r26, r25
              bne .LBB_test_1 ; loopentry
      
      instead of this:
      
      .LBB_test_1:    ; loopentry
              or r27, r28, r28
              add r28, r3, r27
              lhz r28, 3(r28)
              add r26, r4, r27
              lhz r26, 3(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_3 ; shortcirc_next.0
      .LBB_test_2:    ; loopentry.loopexit_crit_edge
              add r2, r30, r27
              add r8, r29, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_3:    ; shortcirc_next.0
              add r28, r3, r27
              lhz r28, 5(r28)
              add r26, r4, r27
              lhz r26, 5(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_5 ; shortcirc_next.1
      .LBB_test_4:    ; shortcirc_next.0.loopexit_crit_edge
              add r2, r11, r27
              add r8, r12, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_5:    ; shortcirc_next.1
              add r28, r3, r27
              lhz r28, 7(r28)
              add r26, r4, r27
              lhz r26, 7(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_7 ; shortcirc_next.2
      .LBB_test_6:    ; shortcirc_next.1.loopexit_crit_edge
              add r2, r9, r27
              add r8, r10, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_7:    ; shortcirc_next.2
              add r28, r3, r27
              lhz r26, 9(r28)
              add r28, r4, r27
              lhz r25, 9(r28)
              addi r28, r27, 8
              cmpw cr7, r26, r25
              mfcr r26, 1
              rlwinm r26, r26, 31, 31, 31
              add r25, r8, r27
              cmpw cr7, r25, r7
              mfcr r25, 1
              rlwinm r25, r25, 29, 31, 31
              and. r26, r26, r25
              bne .LBB_test_1 ; loopentry
      
      Next up, improve the code for the loop.
      
      llvm-svn: 22769
      8447b495
    • Chris Lattner's avatar
      Fix a FIXME: if we are inserting code for a PHI argument, split the critical · 4fec86d3
      Chris Lattner authored
      edge so that the code is not always executed for both operands.  This
      prevents LSR from inserting code into loops whose exit blocks contain
      PHI uses of IV expressions (which are outside of loops).  On gzip, for
      example, we turn this ugly code:
      
      .LBB_test_1:    ; loopentry
              add r27, r3, r28
              lhz r27, 3(r27)
              add r26, r4, r28
              lhz r26, 3(r26)
              add r25, r30, r28    ;; Only live if exiting the loop
              add r24, r29, r28    ;; Only live if exiting the loop
              cmpw cr0, r27, r26
              bne .LBB_test_5 ; loopexit
      
      into this:
      
      .LBB_test_1:    ; loopentry
              or r27, r28, r28
              add r28, r3, r27
              lhz r28, 3(r28)
              add r26, r4, r27
              lhz r26, 3(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_3 ; shortcirc_next.0
      .LBB_test_2:    ; loopentry.loopexit_crit_edge
              add r2, r30, r27
              add r8, r29, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_3:    ; shortcirc_next.0
              ...
              blt .LBB_test_1
      
      
      into this:
      
      .LBB_test_1:    ; loopentry
              or r27, r28, r28
              add r28, r3, r27
              lhz r28, 3(r28)
              add r26, r4, r27
              lhz r26, 3(r26)
              cmpw cr0, r28, r26
              beq .LBB_test_3 ; shortcirc_next.0
      .LBB_test_2:    ; loopentry.loopexit_crit_edge
              add r2, r30, r27
              add r8, r29, r27
              b .LBB_test_9   ; loopexit
      .LBB_test_3:    ; shortcirc_next.0
              ...
              blt .LBB_test_1
      
      
      Next step: get the block out of the loop so that the loop is all
      fall-throughs again.
      
      llvm-svn: 22766
      4fec86d3
  13. Aug 10, 2005
    • Chris Lattner's avatar
      62df7989
    • Chris Lattner's avatar
      Make loop-simplify produce better loops by turning PHI nodes like X = phi [X, Y] · f83ce5fa
      Chris Lattner authored
      into just Y.  This often occurs when it separates loops that have collapsed loop
      headers.  This implements LoopSimplify/phi-node-simplify.ll
      
      llvm-svn: 22746
      f83ce5fa
    • Chris Lattner's avatar
      Allow indvar simplify to canonicalize ANY affine IV, not just affine IVs with · 677d8578
      Chris Lattner authored
      constant stride.  This implements Transforms/IndVarsSimplify/variable-stride-ivs.ll
      
      llvm-svn: 22744
      677d8578
    • Chris Lattner's avatar
      Teach LSR to strength reduce IVs that have a loop-invariant but non-constant stride. · edff91a4
      Chris Lattner authored
      For code like this:
      
      void foo(float *a, float *b, int n, int stride_a, int stride_b) {
        int i;
        for (i=0; i<n; i++)
            a[i*stride_a] = b[i*stride_b];
      }
      
      we now emit:
      
      .LBB_foo2_2:    ; no_exit
              lfs f0, 0(r4)
              stfs f0, 0(r3)
              addi r7, r7, 1
              add r4, r2, r4
              add r3, r6, r3
              cmpw cr0, r7, r5
              blt .LBB_foo2_2 ; no_exit
      
      instead of:
      
      .LBB_foo_2:     ; no_exit
              mullw r8, r2, r7     ;; multiply!
              slwi r8, r8, 2
              lfsx f0, r4, r8
              mullw r8, r2, r6     ;; multiply!
              slwi r8, r8, 2
              stfsx f0, r3, r8
              addi r2, r2, 1
              cmpw cr0, r2, r5
              blt .LBB_foo_2  ; no_exit
      
      loops with variable strides occur pretty often.  For example, in SPECFP2K
      there are 317 variable strides in 177.mesa, 3 in 179.art, 14 in 188.ammp,
      56 in 168.wupwise, 36 in 172.mgrid.
      
      Now we can allow indvars to turn functions written like this:
      
      void foo2(float *a, float *b, int n, int stride_a, int stride_b) {
        int i, ai = 0, bi = 0;
        for (i=0; i<n; i++)
          {
            a[ai] = b[bi];
            ai += stride_a;
            bi += stride_b;
          }
      }
      
      into code like the above for better analysis.  With this patch, they generate
      identical code.
      
      llvm-svn: 22740
      edff91a4
    • Chris Lattner's avatar
      Fix Regression/Transforms/LoopStrengthReduce/phi_node_update_multiple_preds.ll · dde7dc52
      Chris Lattner authored
      by being more careful about updating PHI nodes
      
      llvm-svn: 22739
      dde7dc52
    • Chris Lattner's avatar
      Fix some 80 column violations. · c6c4d99a
      Chris Lattner authored
      Once we compute the evolution for a GEP, tell SE about it.  This allows users
      of the GEP to know it, if the users are not direct.  This allows us to compile
      this testcase:
      
      void fbSolidFillmmx(int w, unsigned char *d) {
          while (w >= 64) {
              *(unsigned long long *) (d +  0) = 0;
              *(unsigned long long *) (d +  8) = 0;
              *(unsigned long long *) (d + 16) = 0;
              *(unsigned long long *) (d + 24) = 0;
              *(unsigned long long *) (d + 32) = 0;
              *(unsigned long long *) (d + 40) = 0;
              *(unsigned long long *) (d + 48) = 0;
              *(unsigned long long *) (d + 56) = 0;
              w -= 64;
              d += 64;
          }
      }
      
      into:
      
      .LBB_fbSolidFillmmx_2:  ; no_exit
              li r2, 0
              stw r2, 0(r4)
              stw r2, 4(r4)
              stw r2, 8(r4)
              stw r2, 12(r4)
              stw r2, 16(r4)
              stw r2, 20(r4)
              stw r2, 24(r4)
              stw r2, 28(r4)
              stw r2, 32(r4)
              stw r2, 36(r4)
              stw r2, 40(r4)
              stw r2, 44(r4)
              stw r2, 48(r4)
              stw r2, 52(r4)
              stw r2, 56(r4)
              stw r2, 60(r4)
              addi r4, r4, 64
              addi r3, r3, -64
              cmpwi cr0, r3, 63
              bgt .LBB_fbSolidFillmmx_2       ; no_exit
      
      instead of:
      
      .LBB_fbSolidFillmmx_2:  ; no_exit
              li r11, 0
              stw r11, 0(r4)
              stw r11, 4(r4)
              stwx r11, r10, r4
              add r12, r10, r4
              stw r11, 4(r12)
              stwx r11, r9, r4
              add r12, r9, r4
              stw r11, 4(r12)
              stwx r11, r8, r4
              add r12, r8, r4
              stw r11, 4(r12)
              stwx r11, r7, r4
              add r12, r7, r4
              stw r11, 4(r12)
              stwx r11, r6, r4
              add r12, r6, r4
              stw r11, 4(r12)
              stwx r11, r5, r4
              add r12, r5, r4
              stw r11, 4(r12)
              stwx r11, r2, r4
              add r12, r2, r4
              stw r11, 4(r12)
              addi r4, r4, 64
              addi r3, r3, -64
              cmpwi cr0, r3, 63
              bgt .LBB_fbSolidFillmmx_2       ; no_exit
      
      llvm-svn: 22737
      c6c4d99a
  14. Aug 09, 2005
Loading