README.txt

  %v2.sub = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 0
  %tmpcast = bitcast [3 x i32*]* %v2 to %"class.std::vector"*
  %conv = sext i32 %N to i64
  store i32* null, i32** %v2.sub, align 8, !tbaa !0
  %tmp3.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 1
  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
  store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  %cmp.i.i.i.i = icmp eq i32 %N, 0
  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i

_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i: ; preds = %entry
  store i32* null, i32** %v2.sub, align 8, !tbaa !0
  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
  store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit

cond.true.i.i.i.i:                                ; preds = %entry
  %cmp.i.i.i.i.i = icmp slt i32 %N, 0
  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i

if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
  call void @_ZSt17__throw_bad_allocv() noreturn nounwind
  unreachable

_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i:    ; preds = %cond.true.i.i.i.i
  %mul.i.i.i.i.i = shl i64 %conv, 2
  %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
  %0 = bitcast i8* %call3.i.i.i.i.i to i32*
  store i32* %0, i32** %v2.sub, align 8, !tbaa !0
  store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
  store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit

This is just the handling of the construction of the vector. Most surprising
here is the fact that all three null stores in %entry are dead, yet they are
not removed (because we do no cross-block DSE).

Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
This is because the client of LazyValueInfo doesn't simplify all instruction
operands, just selected ones.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

void f(char* a, int n) {
  __builtin_memset(a, 0, n);
  for (int i = 0; i < n; ++i)
    a[i] = 0;
}

into:

define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
entry:
  %conv = sext i32 %n to i64
  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body.lr.ph, label %for.end

for.body.lr.ph:                                   ; preds = %entry
  %tmp10 = add i32 %n, -1
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = add i64 %tmp11, 1
  call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
  ret void

for.end:                                          ; preds = %entry
  ret void
}

This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
the two memsets together. The issue with %n seems to stem from poor handling
of the original loop.

To simplify this, we need SCEV to know that "n != 0" because of the dominating
conditional.  That would turn the second memset into a simple memset of 'n'.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

struct S {
  unsigned short m1, m2;
  unsigned char m3, m4;
};

void f(int N) {
  std::vector<S> v(N);
  extern void sink(void*); sink(&v);
}

into poor code for zero-initializing 'v' when N is >0. The problem is that
S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
4 stores on each iteration. If the struct were 8 bytes, this gets turned into
a memset.

In order to handle this we have to:
  A) Teach clang to generate metadata for memsets of structs that have holes in
     them.
  B) Teach clang to use such a memset for zero init of this struct (since it has
     a hole), instead of doing elementwise zeroing.

//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code:

extern const int magic;
double f() { return 0.0 * magic; }

into

@magic = external constant i32

define double @_Z1fv() nounwind readnone {
entry:
  %tmp = load i32* @magic, align 4, !tbaa !0
  %conv = sitofp i32 %tmp to double
  %mul = fmul double %conv, 0.000000e+00
  ret double %mul
}

We should be able to fold away this fmul to 0.0.  More generally, fmul(x,0.0)
can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
not an INF.  The CannotBeNegativeZero predicate in value tracking should be
extended to support general "fpclassify" operations that can return 
yes/no/unknown for each of these predicates.

In this predicate, we know that uitofp is trivially never NaN or -0.0, and
we know that it isn't +/-Inf if the floating point type has enough exponent bits
to represent the largest integer value as < inf.

//===---------------------------------------------------------------------===//

When performing a transformation that can change the sign of 0.0 (such as the
0.0*val -> 0.0 transformation above), it might be provable that the sign of the
expression doesn't matter.  For example, by the above rules, we can't transform
fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
expression is defined to be -0.0.

If we look at the uses of the fmul for example, we might be able to prove that
all uses don't care about the sign of zero.  For example, if we have:

  fadd(fmul(sitofp(x), 0.0), 2.0)

Since we know that X+2.0 doesn't care about the sign of any zeros in X, we can
transform the fmul to 0.0, and then the fadd to 2.0.

//===---------------------------------------------------------------------===//

We should enhance memcpy/memmove/memset to allow a metadata node on them
indicating that some bytes of the transfer are undefined.  This is useful for
frontends like clang when lowering struct copies, when some elements of the
struct are undefined.  Consider something like this:

struct x {
  char a;
  int b[4];
};
void foo(struct x*P);
struct x testfunc() {
  struct x V1, V2;
  foo(&V1);
  V2 = V1;

  return V2;
}

We currently compile this to:
$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S


%struct.x = type { i8, [4 x i32] }

define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
entry:
  %V1 = alloca %struct.x, align 4
  call void @foo(%struct.x* %V1)
  %tmp1 = bitcast %struct.x* %V1 to i8*
  %0 = bitcast %struct.x* %V1 to i160*
  %srcval1 = load i160* %0, align 4
  %tmp2 = bitcast %struct.x* %agg.result to i8*
  %1 = bitcast %struct.x* %agg.result to i160*
  store i160 %srcval1, i160* %1, align 4
  ret void
}

This happens because SRoA sees that the temp alloca is being memcpy'd into and
out of, and because it has holes, SRoA has to be conservative.  If we knew
about the holes, then this could be much much better.

Having information about these holes would also improve memcpy (etc) lowering at
llc time when it gets inlined, because we can use smaller transfers.  This also
avoids partial register stalls in some important cases.

//===---------------------------------------------------------------------===//

We don't fold (icmp (add) (add)) unless the two adds only have a single use.
There are a lot of cases that we're refusing to fold in (e.g.) 256.bzip2, for
example:

 %indvar.next90 = add i64 %indvar89, 1     ;; Has 2 uses
 %tmp96 = add i64 %tmp95, 1                ;; Has 1 use
 %exitcond97 = icmp eq i64 %indvar.next90, %tmp96

We don't fold this because we don't want to introduce an overlapped live range
of the ivar.  However, we can make this more aggressive without causing
performance issues in two ways:

1. If *either* the LHS or RHS has a single use, we can definitely do the
   transformation.  In the overlapping liverange case we're trading one register
   use for one fewer operation, which is a reasonable trade.  Before doing this
   we should verify that the llc output actually shrinks for some benchmarks.
2. If both ops have multiple uses, we can still fold it if the operations are
   both sinkable to *after* the icmp (e.g. in a subsequent block) which doesn't
   increase register pressure.

There are a ton of icmp's we aren't simplifying because of the reg pressure
concern.  Care is warranted here though because many of these are induction
variables and other cases that matter a lot to performance, like the above.
Here's a blob of code that you can drop into the bottom of visitICmp to see some
missed cases:

  { Value *A, *B, *C, *D;
    if (match(Op0, m_Add(m_Value(A), m_Value(B))) && 
        match(Op1, m_Add(m_Value(C), m_Value(D))) &&
        (A == C || A == D || B == C || B == D)) {
      errs() << "OP0 = " << *Op0 << "  U=" << Op0->getNumUses() << "\n";
      errs() << "OP1 = " << *Op1 << "  U=" << Op1->getNumUses() << "\n";
      errs() << "CMP = " << I << "\n\n";
    }
  }

//===---------------------------------------------------------------------===//