Skip to content
  • Chris Lattner's avatar
    When loop rotation happens, it is *very* common for the duplicated condbr · 59c82f85
    Chris Lattner authored
    to be foldable into an uncond branch.  When this happens, we can make a
    much simpler CFG for the loop, which is important for nested loop cases
    where we want the outer loop to be aggressively optimized.
    
    Handle this case more aggressively.  For example, previously on
    phi-duplicate.ll we would get this:
    
    
    define void @test(i32 %N, double* %G) nounwind ssp {
    entry:
      %cmp1 = icmp slt i64 1, 1000
      br i1 %cmp1, label %bb.nph, label %for.end
    
    bb.nph:                                           ; preds = %entry
      br label %for.body
    
    for.body:                                         ; preds = %bb.nph, %for.cond
      %j.02 = phi i64 [ 1, %bb.nph ], [ %inc, %for.cond ]
      %arrayidx = getelementptr inbounds double* %G, i64 %j.02
      %tmp3 = load double* %arrayidx
      %sub = sub i64 %j.02, 1
      %arrayidx6 = getelementptr inbounds double* %G, i64 %sub
      %tmp7 = load double* %arrayidx6
      %add = fadd double %tmp3, %tmp7
      %arrayidx10 = getelementptr inbounds double* %G, i64 %j.02
      store double %add, double* %arrayidx10
      %inc = add nsw i64 %j.02, 1
      br label %for.cond
    
    for.cond:                                         ; preds = %for.body
      %cmp = icmp slt i64 %inc, 1000
      br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge
    
    for.cond.for.end_crit_edge:                       ; preds = %for.cond
      br label %for.end
    
    for.end:                                          ; preds = %for.cond.for.end_crit_edge, %entry
      ret void
    }
    
    Now we get the much nicer:
    
    define void @test(i32 %N, double* %G) nounwind ssp {
    entry:
      br label %for.body
    
    for.body:                                         ; preds = %entry, %for.body
      %j.01 = phi i64 [ 1, %entry ], [ %inc, %for.body ]
      %arrayidx = getelementptr inbounds double* %G, i64 %j.01
      %tmp3 = load double* %arrayidx
      %sub = sub i64 %j.01, 1
      %arrayidx6 = getelementptr inbounds double* %G, i64 %sub
      %tmp7 = load double* %arrayidx6
      %add = fadd double %tmp3, %tmp7
      %arrayidx10 = getelementptr inbounds double* %G, i64 %j.01
      store double %add, double* %arrayidx10
      %inc = add nsw i64 %j.01, 1
      %cmp = icmp slt i64 %inc, 1000
      br i1 %cmp, label %for.body, label %for.end
    
    for.end:                                          ; preds = %for.body
      ret void
    }
    
    With all of these recent changes, we are now able to compile:
    
    void foo(char *X) {
     for (int i = 0; i != 100; ++i) 
       for (int j = 0; j != 100; ++j)
         X[j+i*100] = 0;
    }
    
    into a single memset of 10000 bytes.  This series of changes
    should also be helpful for other nested loop scenarios as well.
    
    llvm-svn: 123079
    59c82f85
Loading