README.txt

it could be:

define i32 @test__(i32 %a, i32 %b) nounwind readnone ssp {
entry:
  %0 = icmp sle i32 %a, %b
  %retval = zext i1 %0 to i32
  ret i32 %retval
}

//===---------------------------------------------------------------------===//

This code can be seen in viterbi:

  %64 = call noalias i8* @malloc(i64 %62) nounwind
...
  %67 = call i64 @llvm.objectsize.i64(i8* %64, i1 false) nounwind
  %68 = call i8* @__memset_chk(i8* %64, i32 0, i64 %62, i64 %67) nounwind

llvm.objectsize.i64 should be taught about malloc/calloc, allowing it to
fold to %62.  This is a security win (overflows of malloc will get caught)
and also a performance win by exposing more memsets to the optimizer.

This occurs several times in viterbi.

Note that this would change the semantics of @llvm.objectsize, which by its
current definition always folds to a constant. We should also make sure that
we remove checking in code like:

  char *p = malloc(strlen(s)+1);
  __strcpy_chk(p, s, __builtin_objectsize(p, 0));

//===---------------------------------------------------------------------===//

This code (from Benchmarks/Dhrystone/dry.c):

define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
entry:
  %sext = shl i32 %0, 24
  %conv = ashr i32 %sext, 24
  %sext6 = shl i32 %1, 24
  %conv4 = ashr i32 %sext6, 24
  %cmp = icmp eq i32 %conv, %conv4
  %. = select i1 %cmp, i32 10000, i32 0
  ret i32 %.
}

Should be simplified into something like:

define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
entry:
  %sext = shl i32 %0, 24
  %conv = and i32 %sext, 0xFF000000
  %sext6 = shl i32 %1, 24
  %conv4 = and i32 %sext6, 0xFF000000
  %cmp = icmp eq i32 %conv, %conv4
  %. = select i1 %cmp, i32 10000, i32 0
  ret i32 %.
}

and then to:

define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
entry:
  %conv = and i32 %0, 0xFF
  %conv4 = and i32 %1, 0xFF
  %cmp = icmp eq i32 %conv, %conv4
  %. = select i1 %cmp, i32 10000, i32 0
  ret i32 %.
}
//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code

int g(unsigned int a) {
  unsigned int c[100];
  c[10] = a;
  c[11] = a;
  unsigned int b = c[10] + c[11];
  if(b > a*2) a = 4;
  else a = 8;
  return a + 7;
}

into

define i32 @g(i32 %a) nounwind readnone {
  %add = shl i32 %a, 1
  %mul = shl i32 %a, 1
  %cmp = icmp ugt i32 %add, %mul
  %a.addr.0 = select i1 %cmp, i32 11, i32 15
  ret i32 %a.addr.0
}

The icmp should fold to false. This CSE opportunity is only available
after GVN and InstCombine have run.

//===---------------------------------------------------------------------===//

memcpyopt should turn this:

define i8* @test10(i32 %x) {
  %alloc = call noalias i8* @malloc(i32 %x) nounwind
  call void @llvm.memset.p0i8.i32(i8* %alloc, i8 0, i32 %x, i32 1, i1 false)
  ret i8* %alloc
}

into a call to calloc.  We should make sure that we analyze calloc as
aggressively as malloc though.

//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code

void f1(int* begin, int* end) {
  std::fill(begin, end, 0);
}

into

define void @_Z2f1PiS_(i32* %begin, i32* %end) nounwind {
entry:
  %cmp7.i.i = icmp eq i32* %begin, %end
  br i1 %cmp7.i.i, label %_ZSt4fillIPiiEvT_S1_RKT0_.exit, label %for.body.i.i

for.body.i.i:                                     ; preds = %entry, %for.body.i.i
  %indvar.i.i = phi i64 [ %tmp, %for.body.i.i ], [ 0, %entry ]
  %tmp = add i64 %indvar.i.i, 1
  %ptrincdec.i.i = getelementptr i32* %begin, i64 %tmp
  %__first.addr.08.i.i = getelementptr i32* %begin, i64 %indvar.i.i
  store i32 0, i32* %__first.addr.08.i.i, align 4, !tbaa !0
  %cmp.i.i = icmp eq i32* %ptrincdec.i.i, %end
  br i1 %cmp.i.i, label %_ZSt4fillIPiiEvT_S1_RKT0_.exit, label %for.body.i.i

_ZSt4fillIPiiEvT_S1_RKT0_.exit:                   ; preds = %for.body.i.i, %entry
  ret void
}

It should compile it to a memset.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

void f(int N) {
  std::vector<int> v(N);

  extern void sink(void*); sink(&v);
}

into

define void @_Z1fi(i32 %N) nounwind {
entry:
  %v2 = alloca [3 x i32*], align 8
  %v2.sub = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 0
  %tmpcast = bitcast [3 x i32*]* %v2 to %"class.std::vector"*
  %conv = sext i32 %N to i64
  store i32* null, i32** %v2.sub, align 8, !tbaa !0
  %tmp3.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 1
  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
  store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  %cmp.i.i.i.i = icmp eq i32 %N, 0
  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i

_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i: ; preds = %entry
  store i32* null, i32** %v2.sub, align 8, !tbaa !0
  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
  store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit

cond.true.i.i.i.i:                                ; preds = %entry
  %cmp.i.i.i.i.i = icmp slt i32 %N, 0
  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i

if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
  call void @_ZSt17__throw_bad_allocv() noreturn nounwind
  unreachable

_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i:    ; preds = %cond.true.i.i.i.i
  %mul.i.i.i.i.i = shl i64 %conv, 2
  %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
  %0 = bitcast i8* %call3.i.i.i.i.i to i32*
  store i32* %0, i32** %v2.sub, align 8, !tbaa !0
  store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
  store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit

This is just handling the construction of the vector. Most surprising here
is the fact that all three null stores in %entry are dead, but not eliminated.
Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

void f(int N) {
  std::vector<int> v(N);
  for (int k = 0; k < N; ++k)
    v[k] = 0;

  extern void sink(void*); sink(&v);
}

into almost the same as the previous note, but replace its final BB with:

for.body.lr.ph:                                   ; preds = %cond.true.i.i.i.i
  %mul.i.i.i.i.i = shl i64 %conv, 2
  %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
  %0 = bitcast i8* %call3.i.i.i.i.i to i32*
  store i32* %0, i32** %v8.sub, align 8, !tbaa !0
  %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
  store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
  store i32* %add.ptr.i.i.i, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %tmp18 = add i32 %N, -1
  %tmp19 = zext i32 %tmp18 to i64
  %tmp20 = shl i64 %tmp19, 2
  %tmp21 = add i64 %tmp20, 4
  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %tmp21, i32 4, i1 false)
  br label %for.end

First off, why (((zext %N - 1) << 2) + 4) instead of the ((sext %N) << 2) done
previously? (or better yet, re-use that one?)

Then, the really painful one is the second memset, of the same memory, to the
same value.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

struct S {
  unsigned short m1, m2;
  unsigned char m3, m4;
};

void f(int N) {
  std::vector<S> v(N);
  extern void sink(void*); sink(&v);
}

into poor code for zero-initializing 'v' when N is >0. The problem is that
S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
4 stores on each iteration. If the struct were 8 bytes, this would get turned
into a memset.

//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code:

extern const int magic;
double f() { return 0.0 * magic; }

into

@magic = external constant i32

define double @_Z1fv() nounwind readnone {
entry:
  %tmp = load i32* @magic, align 4, !tbaa !0
  %conv = sitofp i32 %tmp to double
  %mul = fmul double %conv, 0.000000e+00
  ret double %mul
}

We should be able to fold away this fmul to a constant: there is no 32-bit
integer which after sitofp will generate a NaN, inf, or -0.0. We should fold
this whenever the floating point type has enough exponent bits to represent
the largest integer value as < inf.

//===---------------------------------------------------------------------===//